ctree.c 105.3 KB
Newer Older
C
Chris Mason 已提交
1
/*
C
Chris Mason 已提交
2
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
C
Chris Mason 已提交
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

19
#include <linux/sched.h>
20 21
#include "ctree.h"
#include "disk-io.h"
22
#include "transaction.h"
23
#include "print-tree.h"
24
#include "locking.h"
25

26 27 28
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29
		      *root, struct btrfs_key *ins_key,
30
		      struct btrfs_path *path, int data_size, int extend);
31 32
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
33
			  struct extent_buffer *src, int empty);
34 35 36 37
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
38 39
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot);
40

C
Chris Mason 已提交
41
struct btrfs_path *btrfs_alloc_path(void)
C
Chris Mason 已提交
42
{
C
Chris Mason 已提交
43
	struct btrfs_path *path;
J
Jeff Mahoney 已提交
44 45
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	if (path)
46
		path->reada = 1;
C
Chris Mason 已提交
47
	return path;
C
Chris Mason 已提交
48 49
}

50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (p->nodes[i] && p->locks[i])
			btrfs_set_lock_blocking(p->nodes[i]);
	}
}

/*
 * reset all the locked nodes in the patch to spinning locks.
65 66 67 68 69
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
70
 */
71 72
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held)
73 74
{
	int i;
75 76 77 78 79 80 81 82 83 84 85 86 87 88

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held)
		btrfs_set_lock_blocking(held);
	btrfs_set_path_blocking(p);
#endif

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
89 90 91
		if (p->nodes[i] && p->locks[i])
			btrfs_clear_lock_blocking(p->nodes[i]);
	}
92 93 94 95 96

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking(held);
#endif
97 98
}

C
Chris Mason 已提交
99
/* this also releases the path */
C
Chris Mason 已提交
100
void btrfs_free_path(struct btrfs_path *p)
101
{
C
Chris Mason 已提交
102 103
	btrfs_release_path(NULL, p);
	kmem_cache_free(btrfs_path_cachep, p);
104 105
}

C
Chris Mason 已提交
106 107 108 109 110 111
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that no locks or extent buffers held.
 */
C
Chris Mason 已提交
112
noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
113 114
{
	int i;
115

C
Chris Mason 已提交
116
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
117
		p->slots[i] = 0;
118
		if (!p->nodes[i])
119 120 121 122 123
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock(p->nodes[i]);
			p->locks[i] = 0;
		}
124
		free_extent_buffer(p->nodes[i]);
125
		p->nodes[i] = NULL;
126 127 128
	}
}

C
Chris Mason 已提交
129 130 131 132 133 134 135 136 137 138
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
139 140 141 142 143 144 145 146 147 148
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;
	spin_lock(&root->node_lock);
	eb = root->node;
	extent_buffer_get(eb);
	spin_unlock(&root->node_lock);
	return eb;
}

C
Chris Mason 已提交
149 150 151 152
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
153 154 155 156
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

C
Chris Mason 已提交
157
	while (1) {
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);

		spin_lock(&root->node_lock);
		if (eb == root->node) {
			spin_unlock(&root->node_lock);
			break;
		}
		spin_unlock(&root->node_lock);

		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

C
Chris Mason 已提交
174 175 176 177
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
178 179 180 181 182 183 184 185
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
}

C
Chris Mason 已提交
186 187 188 189 190
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
191 192 193 194 195 196 197 198 199
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	u32 nritems;
	int ret = 0;
	int level;
200
	struct btrfs_root *new_root;
201

202 203 204 205 206 207
	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
	if (!new_root)
		return -ENOMEM;

	memcpy(new_root, root, sizeof(*new_root));
	new_root->root_key.objectid = new_root_objectid;
208 209 210 211 212 213 214

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
215 216 217 218

	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
				     new_root_objectid, trans->transid,
				     level, buf->start, 0);
219 220
	if (IS_ERR(cow)) {
		kfree(new_root);
221
		return PTR_ERR(cow);
222
	}
223 224 225 226 227

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, new_root_objectid);
228
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
229

Y
Yan Zheng 已提交
230 231 232 233
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

234
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
Z
Zheng Yan 已提交
235
	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
236 237
	kfree(new_root);

238 239 240 241 242 243 244 245
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

C
Chris Mason 已提交
246
/*
C
Chris Mason 已提交
247 248 249 250
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
C
Chris Mason 已提交
251 252 253
 *
 * search_start -- an allocation hint for the new block
 *
C
Chris Mason 已提交
254 255 256
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
C
Chris Mason 已提交
257
 */
C
Chris Mason 已提交
258
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
259 260 261 262
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
263
			     u64 search_start, u64 empty_size)
C
Chris Mason 已提交
264
{
Z
Zheng Yan 已提交
265
	u64 parent_start;
266
	struct extent_buffer *cow;
267
	u32 nritems;
268
	int ret = 0;
269
	int level;
270
	int unlock_orig = 0;
271

272 273 274
	if (*cow_ret == buf)
		unlock_orig = 1;

275
	btrfs_assert_tree_locked(buf);
276

Z
Zheng Yan 已提交
277 278 279 280 281
	if (parent)
		parent_start = parent->start;
	else
		parent_start = 0;

282 283
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
284
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
285

286 287
	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
288

289 290 291 292
	cow = btrfs_alloc_free_block(trans, root, buf->len,
				     parent_start, root->root_key.objectid,
				     trans->transid, level,
				     search_start, empty_size);
293 294
	if (IS_ERR(cow))
		return PTR_ERR(cow);
295

296 297
	/* cow is set to blocking by btrfs_init_new_buffer */

298
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
299
	btrfs_set_header_bytenr(cow, cow->start);
300 301
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, root->root_key.objectid);
302
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
303

Y
Yan Zheng 已提交
304 305 306 307
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

308 309
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (btrfs_header_generation(buf) != trans->transid) {
Z
Zheng Yan 已提交
310 311
		u32 nr_extents;
		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
312 313
		if (ret)
			return ret;
Z
Zheng Yan 已提交
314 315 316

		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
		WARN_ON(ret);
Z
Zheng Yan 已提交
317 318 319 320
	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
		/*
		 * There are only two places that can drop reference to
		 * tree blocks owned by living reloc trees, one is here,
Y
Yan Zheng 已提交
321
		 * the other place is btrfs_drop_subtree. In both places,
Z
Zheng Yan 已提交
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337
		 * we check reference count while tree block is locked.
		 * Furthermore, if reference count is one, it won't get
		 * increased by someone else.
		 */
		u32 refs;
		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
					      buf->len, &refs);
		BUG_ON(ret);
		if (refs == 1) {
			ret = btrfs_update_ref(trans, root, buf, cow,
					       0, nritems);
			clean_tree_block(trans, root, buf);
		} else {
			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
		}
		BUG_ON(ret);
338
	} else {
Z
Zheng Yan 已提交
339 340 341
		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
		if (ret)
			return ret;
342 343 344
		clean_tree_block(trans, root, buf);
	}

Z
Zheng Yan 已提交
345 346 347 348 349
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
		WARN_ON(ret);
	}

C
Chris Mason 已提交
350
	if (buf == root->node) {
351 352 353
		WARN_ON(parent && parent != buf);

		spin_lock(&root->node_lock);
C
Chris Mason 已提交
354
		root->node = cow;
355
		extent_buffer_get(cow);
356 357
		spin_unlock(&root->node_lock);

C
Chris Mason 已提交
358
		if (buf != root->commit_root) {
359
			btrfs_free_extent(trans, root, buf->start,
Z
Zheng Yan 已提交
360 361 362
					  buf->len, buf->start,
					  root->root_key.objectid,
					  btrfs_header_generation(buf),
363
					  level, 1);
C
Chris Mason 已提交
364
		}
365
		free_extent_buffer(buf);
366
		add_root_to_dirty_list(root);
C
Chris Mason 已提交
367
	} else {
368
		btrfs_set_node_blockptr(parent, parent_slot,
369
					cow->start);
370 371 372
		WARN_ON(trans->transid == 0);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
C
Chris Mason 已提交
373
		btrfs_mark_buffer_dirty(parent);
374
		WARN_ON(btrfs_header_generation(parent) != trans->transid);
375
		btrfs_free_extent(trans, root, buf->start, buf->len,
Z
Zheng Yan 已提交
376
				  parent_start, btrfs_header_owner(parent),
377
				  btrfs_header_generation(parent), level, 1);
C
Chris Mason 已提交
378
	}
379 380
	if (unlock_orig)
		btrfs_tree_unlock(buf);
381
	free_extent_buffer(buf);
C
Chris Mason 已提交
382
	btrfs_mark_buffer_dirty(cow);
C
Chris Mason 已提交
383
	*cow_ret = cow;
C
Chris Mason 已提交
384 385 386
	return 0;
}

C
Chris Mason 已提交
387 388 389 390 391
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
C
Chris Mason 已提交
392
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
393 394
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
395
		    struct extent_buffer **cow_ret)
396 397
{
	u64 search_start;
398
	int ret;
C
Chris Mason 已提交
399

400
	if (trans->transaction != root->fs_info->running_transaction) {
C
Chris Mason 已提交
401 402 403
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)
404 405 406 407
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
C
Chris Mason 已提交
408 409 410
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)root->fs_info->generation);
411 412
		WARN_ON(1);
	}
C
Chris Mason 已提交
413

414 415
	if (btrfs_header_generation(buf) == trans->transid &&
	    btrfs_header_owner(buf) == root->root_key.objectid &&
416
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
417 418 419
		*cow_ret = buf;
		return 0;
	}
420

421
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
422 423 424 425 426

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

427
	ret = __btrfs_cow_block(trans, root, buf, parent,
428
				 parent_slot, cow_ret, search_start, 0);
429
	return ret;
430 431
}

C
Chris Mason 已提交
432 433 434 435
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
436
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
437
{
438
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
439
		return 1;
440
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
441 442 443 444
		return 1;
	return 0;
}

445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	if (k1.objectid > k2->objectid)
		return 1;
	if (k1.objectid < k2->objectid)
		return -1;
	if (k1.type > k2->type)
		return 1;
	if (k1.type < k2->type)
		return -1;
	if (k1.offset > k2->offset)
		return 1;
	if (k1.offset < k2->offset)
		return -1;
	return 0;
}

469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487
/*
 * same as comp_keys only with two btrfs_key's
 */
static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
488

C
Chris Mason 已提交
489 490 491 492 493
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
494
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
495
		       struct btrfs_root *root, struct extent_buffer *parent,
496 497
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
498
{
499
	struct extent_buffer *cur;
500
	u64 blocknr;
501
	u64 gen;
502 503
	u64 search_start = *last_ret;
	u64 last_block = 0;
504 505 506 507 508
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
509
	int parent_level;
510 511
	int uptodate;
	u32 blocksize;
512 513
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;
514

515 516 517 518
	parent_level = btrfs_header_level(parent);
	if (cache_only && parent_level != 1)
		return 0;

C
Chris Mason 已提交
519
	if (trans->transaction != root->fs_info->running_transaction)
520
		WARN_ON(1);
C
Chris Mason 已提交
521
	if (trans->transid != root->fs_info->generation)
522
		WARN_ON(1);
523

524 525
	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
526 527 528 529 530
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

531 532
	btrfs_set_lock_blocking(parent);

533 534
	for (i = start_slot; i < end_slot; i++) {
		int close = 1;
535

536 537 538 539 540 541 542 543
		if (!parent->map_token) {
			map_extent_buffer(parent,
					btrfs_node_key_ptr_offset(i),
					sizeof(struct btrfs_key_ptr),
					&parent->map_token, &parent->kaddr,
					&parent->map_start, &parent->map_len,
					KM_USER1);
		}
544 545 546 547 548
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
549
		blocknr = btrfs_node_blockptr(parent, i);
550
		gen = btrfs_node_ptr_generation(parent, i);
551 552
		if (last_block == 0)
			last_block = blocknr;
553

554
		if (i > 0) {
555 556
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
557
		}
C
Chris Mason 已提交
558
		if (!close && i < end_slot - 2) {
559 560
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
561
		}
562 563
		if (close) {
			last_block = blocknr;
564
			continue;
565
		}
566 567 568 569 570
		if (parent->map_token) {
			unmap_extent_buffer(parent, parent->map_token,
					    KM_USER1);
			parent->map_token = NULL;
		}
571

572 573
		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
574
			uptodate = btrfs_buffer_uptodate(cur, gen);
575 576
		else
			uptodate = 0;
577
		if (!cur || !uptodate) {
578
			if (cache_only) {
579
				free_extent_buffer(cur);
580 581
				continue;
			}
582 583
			if (!cur) {
				cur = read_tree_block(root, blocknr,
584
							 blocksize, gen);
585
			} else if (!uptodate) {
586
				btrfs_read_buffer(cur, gen);
587
			}
588
		}
589
		if (search_start == 0)
590
			search_start = last_block;
591

592
		btrfs_tree_lock(cur);
593
		btrfs_set_lock_blocking(cur);
594
		err = __btrfs_cow_block(trans, root, cur, parent, i,
595
					&cur, search_start,
596
					min(16 * blocksize,
597
					    (end_slot - i) * blocksize));
Y
Yan 已提交
598
		if (err) {
599
			btrfs_tree_unlock(cur);
600
			free_extent_buffer(cur);
601
			break;
Y
Yan 已提交
602
		}
603 604
		search_start = cur->start;
		last_block = cur->start;
605
		*last_ret = search_start;
606 607
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
608
	}
609 610 611 612 613
	if (parent->map_token) {
		unmap_extent_buffer(parent, parent->map_token,
				    KM_USER1);
		parent->map_token = NULL;
	}
614 615 616
	return err;
}

C
Chris Mason 已提交
617 618 619 620 621
/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
C
Chris Mason 已提交
622
static inline unsigned int leaf_data_end(struct btrfs_root *root,
623
					 struct extent_buffer *leaf)
624
{
625
	u32 nr = btrfs_header_nritems(leaf);
626
	if (nr == 0)
C
Chris Mason 已提交
627
		return BTRFS_LEAF_DATA_SIZE(root);
628
	return btrfs_item_offset_nr(leaf, nr - 1);
629 630
}

C
Chris Mason 已提交
631 632 633 634
/*
 * extra debugging checks to make sure all the items in a key are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
635 636
static int check_node(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
637
{
638 639 640 641
	struct extent_buffer *parent = NULL;
	struct extent_buffer *node = path->nodes[level];
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key node_key;
C
Chris Mason 已提交
642
	int parent_slot;
643 644
	int slot;
	struct btrfs_key cpukey;
645
	u32 nritems = btrfs_header_nritems(node);
C
Chris Mason 已提交
646 647

	if (path->nodes[level + 1])
648
		parent = path->nodes[level + 1];
A
Aneesh 已提交
649

650
	slot = path->slots[level];
651 652
	BUG_ON(nritems == 0);
	if (parent) {
A
Aneesh 已提交
653
		parent_slot = path->slots[level + 1];
654 655 656
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_node_key(node, &node_key, 0);
		BUG_ON(memcmp(&parent_key, &node_key,
C
Chris Mason 已提交
657
			      sizeof(struct btrfs_disk_key)));
658
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
659
		       btrfs_header_bytenr(node));
C
Chris Mason 已提交
660
	}
C
Chris Mason 已提交
661
	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
662
	if (slot != 0) {
663 664 665
		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
666 667
	}
	if (slot < nritems - 1) {
668 669 670
		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
C
Chris Mason 已提交
671 672 673 674
	}
	return 0;
}

C
Chris Mason 已提交
675 676 677 678
/*
 * extra checking to make sure all the items in a leaf are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
679 680
static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
681
{
682 683
	struct extent_buffer *leaf = path->nodes[level];
	struct extent_buffer *parent = NULL;
C
Chris Mason 已提交
684
	int parent_slot;
685
	struct btrfs_key cpukey;
686 687 688
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key leaf_key;
	int slot = path->slots[0];
689

690
	u32 nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
691 692

	if (path->nodes[level + 1])
693
		parent = path->nodes[level + 1];
694 695 696 697 698

	if (nritems == 0)
		return 0;

	if (parent) {
A
Aneesh 已提交
699
		parent_slot = path->slots[level + 1];
700 701
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_item_key(leaf, &leaf_key, 0);
702

703
		BUG_ON(memcmp(&parent_key, &leaf_key,
C
Chris Mason 已提交
704
		       sizeof(struct btrfs_disk_key)));
705
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
706
		       btrfs_header_bytenr(leaf));
707 708 709 710 711 712
	}
	if (slot != 0 && slot < nritems - 1) {
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
		if (comp_keys(&leaf_key, &cpukey) <= 0) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
713
			printk(KERN_CRIT "slot %d offset bad key\n", slot);
714 715 716 717 718
			BUG_ON(1);
		}
		if (btrfs_item_offset_nr(leaf, slot - 1) !=
		       btrfs_item_end_nr(leaf, slot)) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
719
			printk(KERN_CRIT "slot %d offset bad\n", slot);
720 721
			BUG_ON(1);
		}
722 723
	}
	if (slot < nritems - 1) {
724 725 726 727 728 729
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
		if (btrfs_item_offset_nr(leaf, slot) !=
			btrfs_item_end_nr(leaf, slot + 1)) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
730
			printk(KERN_CRIT "slot %d offset bad\n", slot);
731 732
			BUG_ON(1);
		}
C
Chris Mason 已提交
733
	}
734 735
	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
C
Chris Mason 已提交
736 737 738
	return 0;
}

C
Chris Mason 已提交
739
static noinline int check_block(struct btrfs_root *root,
740
				struct btrfs_path *path, int level)
C
Chris Mason 已提交
741
{
742
	return 0;
C
Chris Mason 已提交
743
	if (level == 0)
C
Chris Mason 已提交
744 745
		return check_leaf(root, path, level);
	return check_node(root, path, level);
C
Chris Mason 已提交
746 747
}

C
Chris Mason 已提交
748
/*
749 750 751
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
C
Chris Mason 已提交
752 753 754 755 756 757
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
758 759 760 761
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
762 763 764 765 766
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
767
	struct btrfs_disk_key *tmp = NULL;
768 769 770 771 772 773
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
774
	int err;
775

C
Chris Mason 已提交
776
	while (low < high) {
777
		mid = (low + high) / 2;
778 779 780 781 782
		offset = p + mid * item_size;

		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
783
			if (map_token) {
784
				unmap_extent_buffer(eb, map_token, KM_USER0);
785 786
				map_token = NULL;
			}
787 788

			err = map_private_extent_buffer(eb, offset,
789 790 791 792 793 794 795 796 797 798 799 800
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}
801 802 803 804 805

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
806 807 808 809 810 811 812 813
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
814 815
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
816 817 818 819
			return 0;
		}
	}
	*slot = low;
820 821
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
822 823 824
	return 1;
}

C
Chris Mason 已提交
825 826 827 828
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
829 830
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
831
{
832 833 834
	if (level == 0) {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
C
Chris Mason 已提交
835
					  sizeof(struct btrfs_item),
836
					  key, btrfs_header_nritems(eb),
837
					  slot);
838
	} else {
839 840
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
C
Chris Mason 已提交
841
					  sizeof(struct btrfs_key_ptr),
842
					  key, btrfs_header_nritems(eb),
843
					  slot);
844 845 846 847
	}
	return -1;
}

C
Chris Mason 已提交
848 849 850 851
/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
852
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
853
				   struct extent_buffer *parent, int slot)
854
{
855
	int level = btrfs_header_level(parent);
856 857
	if (slot < 0)
		return NULL;
858
	if (slot >= btrfs_header_nritems(parent))
859
		return NULL;
860 861 862

	BUG_ON(level == 0);

863
	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
864 865
		       btrfs_level_size(root, level - 1),
		       btrfs_node_ptr_generation(parent, slot));
866 867
}

C
Chris Mason 已提交
868 869 870 871 872
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 */
873
static noinline int balance_level(struct btrfs_trans_handle *trans,
874 875
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
876
{
877 878 879 880
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
881 882 883 884
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
885
	int err_on_enospc = 0;
886
	u64 orig_ptr;
887 888 889 890

	if (level == 0)
		return 0;

891
	mid = path->nodes[level];
892

893
	WARN_ON(!path->locks[level]);
894 895
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

896
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
897

C
Chris Mason 已提交
898
	if (level < BTRFS_MAX_LEVEL - 1)
899
		parent = path->nodes[level + 1];
900 901
	pslot = path->slots[level + 1];

C
Chris Mason 已提交
902 903 904 905
	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
906 907
	if (!parent) {
		struct extent_buffer *child;
908

909
		if (btrfs_header_nritems(mid) != 1)
910 911 912
			return 0;

		/* promote the child to a root */
913
		child = read_node_slot(root, mid, 0);
914
		BUG_ON(!child);
915
		btrfs_tree_lock(child);
916
		btrfs_set_lock_blocking(child);
917
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
918 919
		BUG_ON(ret);

920
		spin_lock(&root->node_lock);
921
		root->node = child;
922 923
		spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
924 925 926
		ret = btrfs_update_extent_ref(trans, root, child->start,
					      mid->start, child->start,
					      root->root_key.objectid,
927
					      trans->transid, level - 1);
Z
Zheng Yan 已提交
928 929
		BUG_ON(ret);

930
		add_root_to_dirty_list(root);
931
		btrfs_tree_unlock(child);
932

933
		path->locks[level] = 0;
934
		path->nodes[level] = NULL;
935
		clean_tree_block(trans, root, mid);
936
		btrfs_tree_unlock(mid);
937
		/* once for the path */
938
		free_extent_buffer(mid);
939
		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
Z
Zheng Yan 已提交
940
					mid->start, root->root_key.objectid,
941 942
					btrfs_header_generation(mid),
					level, 1);
943
		/* once for the root ptr */
944
		free_extent_buffer(mid);
945
		return ret;
946
	}
947
	if (btrfs_header_nritems(mid) >
C
Chris Mason 已提交
948
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
949 950
		return 0;

951
	if (btrfs_header_nritems(mid) < 2)
952 953
		err_on_enospc = 1;

954 955
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
956
		btrfs_tree_lock(left);
957
		btrfs_set_lock_blocking(left);
958
		wret = btrfs_cow_block(trans, root, left,
959
				       parent, pslot - 1, &left);
960 961 962 963
		if (wret) {
			ret = wret;
			goto enospc;
		}
964
	}
965 966
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
967
		btrfs_tree_lock(right);
968
		btrfs_set_lock_blocking(right);
969
		wret = btrfs_cow_block(trans, root, right,
970
				       parent, pslot + 1, &right);
971 972 973 974 975 976 977
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
978 979
	if (left) {
		orig_slot += btrfs_header_nritems(left);
980
		wret = push_node_left(trans, root, left, mid, 1);
981 982
		if (wret < 0)
			ret = wret;
983
		if (btrfs_header_nritems(mid) < 2)
984
			err_on_enospc = 1;
985
	}
986 987 988 989

	/*
	 * then try to empty the right most buffer into the middle
	 */
990
	if (right) {
991
		wret = push_node_left(trans, root, mid, right, 1);
992
		if (wret < 0 && wret != -ENOSPC)
993
			ret = wret;
994
		if (btrfs_header_nritems(right) == 0) {
995
			u64 bytenr = right->start;
996
			u64 generation = btrfs_header_generation(parent);
997 998
			u32 blocksize = right->len;

999
			clean_tree_block(trans, root, right);
1000
			btrfs_tree_unlock(right);
1001
			free_extent_buffer(right);
1002
			right = NULL;
1003 1004
			wret = del_ptr(trans, root, path, level + 1, pslot +
				       1);
1005 1006
			if (wret)
				ret = wret;
1007
			wret = btrfs_free_extent(trans, root, bytenr,
Z
Zheng Yan 已提交
1008
						 blocksize, parent->start,
1009
						 btrfs_header_owner(parent),
1010
						 generation, level, 1);
1011 1012 1013
			if (wret)
				ret = wret;
		} else {
1014 1015 1016 1017
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
1018 1019
		}
	}
1020
	if (btrfs_header_nritems(mid) == 1) {
1021 1022 1023 1024 1025 1026 1027 1028 1029
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
1030 1031
		BUG_ON(!left);
		wret = balance_node_right(trans, root, mid, left);
1032
		if (wret < 0) {
1033
			ret = wret;
1034 1035
			goto enospc;
		}
1036 1037 1038 1039 1040
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
1041 1042
		BUG_ON(wret == 1);
	}
1043
	if (btrfs_header_nritems(mid) == 0) {
1044
		/* we've managed to empty the middle node, drop it */
1045
		u64 root_gen = btrfs_header_generation(parent);
1046 1047
		u64 bytenr = mid->start;
		u32 blocksize = mid->len;
1048

1049
		clean_tree_block(trans, root, mid);
1050
		btrfs_tree_unlock(mid);
1051
		free_extent_buffer(mid);
1052
		mid = NULL;
1053
		wret = del_ptr(trans, root, path, level + 1, pslot);
1054 1055
		if (wret)
			ret = wret;
1056
		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
Z
Zheng Yan 已提交
1057
					 parent->start,
1058
					 btrfs_header_owner(parent),
1059
					 root_gen, level, 1);
1060 1061
		if (wret)
			ret = wret;
1062 1063
	} else {
		/* update the parent key to reflect our changes */
1064 1065 1066 1067
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
1068
	}
1069

1070
	/* update the path */
1071 1072 1073
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
1074
			/* left was locked after cow */
1075
			path->nodes[level] = left;
1076 1077
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
1078 1079
			if (mid) {
				btrfs_tree_unlock(mid);
1080
				free_extent_buffer(mid);
1081
			}
1082
		} else {
1083
			orig_slot -= btrfs_header_nritems(left);
1084 1085 1086
			path->slots[level] = orig_slot;
		}
	}
1087
	/* double check we haven't messed things up */
C
Chris Mason 已提交
1088
	check_block(root, path, level);
C
Chris Mason 已提交
1089
	if (orig_ptr !=
1090
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1091
		BUG();
1092
enospc:
1093 1094
	if (right) {
		btrfs_tree_unlock(right);
1095
		free_extent_buffer(right);
1096 1097 1098 1099
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
1100
		free_extent_buffer(left);
1101
	}
1102 1103 1104
	return ret;
}

C
Chris Mason 已提交
1105 1106 1107 1108
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
C
Chris Mason 已提交
1109
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1110 1111
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
1112
{
1113 1114 1115 1116
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
1117 1118 1119 1120 1121 1122 1123 1124 1125
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 1;

1126
	mid = path->nodes[level];
1127
	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1128 1129 1130
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1)
1131
		parent = path->nodes[level + 1];
1132 1133
	pslot = path->slots[level + 1];

1134
	if (!parent)
1135 1136
		return 1;

1137
	left = read_node_slot(root, parent, pslot - 1);
1138 1139

	/* first, try to make some room in the middle buffer */
1140
	if (left) {
1141
		u32 left_nr;
1142 1143

		btrfs_tree_lock(left);
1144 1145
		btrfs_set_lock_blocking(left);

1146
		left_nr = btrfs_header_nritems(left);
C
Chris Mason 已提交
1147 1148 1149
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1150
			ret = btrfs_cow_block(trans, root, left, parent,
1151
					      pslot - 1, &left);
1152 1153 1154 1155
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
1156
						      left, mid, 0);
1157
			}
C
Chris Mason 已提交
1158
		}
1159 1160 1161
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1162
			struct btrfs_disk_key disk_key;
1163
			orig_slot += left_nr;
1164 1165 1166 1167 1168
			btrfs_node_key(mid, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
1169 1170
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
1171
				btrfs_tree_unlock(mid);
1172
				free_extent_buffer(mid);
1173 1174
			} else {
				orig_slot -=
1175
					btrfs_header_nritems(left);
1176
				path->slots[level] = orig_slot;
1177
				btrfs_tree_unlock(left);
1178
				free_extent_buffer(left);
1179 1180 1181
			}
			return 0;
		}
1182
		btrfs_tree_unlock(left);
1183
		free_extent_buffer(left);
1184
	}
1185
	right = read_node_slot(root, parent, pslot + 1);
1186 1187 1188 1189

	/*
	 * then try to empty the right most buffer into the middle
	 */
1190
	if (right) {
C
Chris Mason 已提交
1191
		u32 right_nr;
1192

1193
		btrfs_tree_lock(right);
1194 1195
		btrfs_set_lock_blocking(right);

1196
		right_nr = btrfs_header_nritems(right);
C
Chris Mason 已提交
1197 1198 1199
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1200 1201
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
1202
					      &right);
1203 1204 1205 1206
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
1207
							  right, mid);
1208
			}
C
Chris Mason 已提交
1209
		}
1210 1211 1212
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1213 1214 1215 1216 1217 1218 1219 1220
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
1221 1222
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
1223
					btrfs_header_nritems(mid);
1224
				btrfs_tree_unlock(mid);
1225
				free_extent_buffer(mid);
1226
			} else {
1227
				btrfs_tree_unlock(right);
1228
				free_extent_buffer(right);
1229 1230 1231
			}
			return 0;
		}
1232
		btrfs_tree_unlock(right);
1233
		free_extent_buffer(right);
1234 1235 1236 1237
	}
	return 1;
}

1238
/*
C
Chris Mason 已提交
1239 1240
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
1241
 */
1242 1243 1244
static noinline void reada_for_search(struct btrfs_root *root,
				      struct btrfs_path *path,
				      int level, int slot, u64 objectid)
1245
{
1246
	struct extent_buffer *node;
1247
	struct btrfs_disk_key disk_key;
1248 1249
	u32 nritems;
	u64 search;
1250
	u64 target;
1251
	u64 nread = 0;
1252
	int direction = path->reada;
1253
	struct extent_buffer *eb;
1254 1255 1256
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;
1257

1258
	if (level != 1)
1259 1260 1261
		return;

	if (!path->nodes[level])
1262 1263
		return;

1264
	node = path->nodes[level];
1265

1266
	search = btrfs_node_blockptr(node, slot);
1267 1268
	blocksize = btrfs_level_size(root, level - 1);
	eb = btrfs_find_tree_block(root, search, blocksize);
1269 1270
	if (eb) {
		free_extent_buffer(eb);
1271 1272 1273
		return;
	}

1274
	target = search;
1275

1276
	nritems = btrfs_header_nritems(node);
1277
	nr = slot;
C
Chris Mason 已提交
1278
	while (1) {
1279 1280 1281 1282 1283 1284 1285 1286
		if (direction < 0) {
			if (nr == 0)
				break;
			nr--;
		} else if (direction > 0) {
			nr++;
			if (nr >= nritems)
				break;
1287
		}
1288 1289 1290 1291 1292
		if (path->reada < 0 && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
1293
		search = btrfs_node_blockptr(node, nr);
1294 1295
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
1296 1297
			readahead_tree_block(root, search, blocksize,
				     btrfs_node_ptr_generation(node, nr));
1298 1299 1300
			nread += blocksize;
		}
		nscan++;
1301
		if ((nread > 65536 || nscan > 32))
1302
			break;
1303 1304
	}
}
1305

1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367
/*
 * returns -EAGAIN if it had to drop the path, or zero if everything was in
 * cache
 */
static noinline int reada_for_balance(struct btrfs_root *root,
				      struct btrfs_path *path, int level)
{
	int slot;
	int nritems;
	struct extent_buffer *parent;
	struct extent_buffer *eb;
	u64 gen;
	u64 block1 = 0;
	u64 block2 = 0;
	int ret = 0;
	int blocksize;

	parent = path->nodes[level - 1];
	if (!parent)
		return 0;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level];
	blocksize = btrfs_level_size(root, level);

	if (slot > 0) {
		block1 = btrfs_node_blockptr(parent, slot - 1);
		gen = btrfs_node_ptr_generation(parent, slot - 1);
		eb = btrfs_find_tree_block(root, block1, blocksize);
		if (eb && btrfs_buffer_uptodate(eb, gen))
			block1 = 0;
		free_extent_buffer(eb);
	}
	if (slot < nritems) {
		block2 = btrfs_node_blockptr(parent, slot + 1);
		gen = btrfs_node_ptr_generation(parent, slot + 1);
		eb = btrfs_find_tree_block(root, block2, blocksize);
		if (eb && btrfs_buffer_uptodate(eb, gen))
			block2 = 0;
		free_extent_buffer(eb);
	}
	if (block1 || block2) {
		ret = -EAGAIN;
		btrfs_release_path(root, path);
		if (block1)
			readahead_tree_block(root, block1, blocksize, 0);
		if (block2)
			readahead_tree_block(root, block2, blocksize, 0);

		if (block1) {
			eb = read_tree_block(root, block1, blocksize, 0);
			free_extent_buffer(eb);
		}
		if (block1) {
			eb = read_tree_block(root, block2, blocksize, 0);
			free_extent_buffer(eb);
		}
	}
	return ret;
}


C
Chris Mason 已提交
1368
/*
C
Chris Mason 已提交
1369 1370 1371 1372
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
C
Chris Mason 已提交
1373
 *
C
Chris Mason 已提交
1374 1375 1376
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
C
Chris Mason 已提交
1377
 *
C
Chris Mason 已提交
1378 1379
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
C
Chris Mason 已提交
1380
 */
1381 1382
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
1383 1384 1385
{
	int i;
	int skip_level = level;
1386
	int no_skips = 0;
1387 1388 1389 1390 1391 1392 1393
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
1394
		if (!no_skips && path->slots[i] == 0) {
1395 1396 1397
			skip_level = i + 1;
			continue;
		}
1398
		if (!no_skips && path->keep_locks) {
1399 1400 1401
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
1402
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
1403 1404 1405 1406
				skip_level = i + 1;
				continue;
			}
		}
1407 1408 1409
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

1410 1411 1412 1413 1414 1415 1416 1417
		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock(t);
			path->locks[i] = 0;
		}
	}
}

1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435
/*
 * This releases any locks held in the path starting at level and
 * going all the way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few
 * corner cases, such as COW of the block at slot zero in the node.  This
 * ignores those rules, and it should only be called when there are no
 * more updates to be done higher up in the tree.
 */
noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
	int i;

	if (path->keep_locks || path->lowest_level)
		return;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
1436
			continue;
1437
		if (!path->locks[i])
1438
			continue;
1439 1440 1441 1442 1443
		btrfs_tree_unlock(path->nodes[i]);
		path->locks[i] = 0;
	}
}

C
Chris Mason 已提交
1444 1445 1446 1447 1448 1449
/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
C
Chris Mason 已提交
1450 1451
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
C
Chris Mason 已提交
1452 1453 1454 1455
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
C
Chris Mason 已提交
1456
 */
1457 1458 1459
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
1460
{
1461
	struct extent_buffer *b;
1462
	struct extent_buffer *tmp;
1463 1464 1465
	int slot;
	int ret;
	int level;
1466
	int should_reada = p->reada;
1467
	int lowest_unlock = 1;
1468
	int blocksize;
1469
	u8 lowest_level = 0;
1470 1471
	u64 blocknr;
	u64 gen;
1472

1473
	lowest_level = p->lowest_level;
1474
	WARN_ON(lowest_level && ins_len > 0);
C
Chris Mason 已提交
1475
	WARN_ON(p->nodes[0] != NULL);
1476

1477 1478
	if (ins_len < 0)
		lowest_unlock = 2;
1479

1480
again:
1481 1482 1483 1484
	if (p->skip_locking)
		b = btrfs_root_node(root);
	else
		b = btrfs_lock_root_node(root);
1485

1486
	while (b) {
1487
		level = btrfs_header_level(b);
1488 1489 1490 1491 1492 1493 1494 1495 1496

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

C
Chris Mason 已提交
1497 1498
		if (cow) {
			int wret;
1499 1500 1501

			/* is a cow on this block not required */
			if (btrfs_header_generation(b) == trans->transid &&
1502
			    btrfs_header_owner(b) == root->root_key.objectid &&
1503 1504 1505
			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
				goto cow_done;
			}
1506 1507
			btrfs_set_path_blocking(p);

C
Chris Mason 已提交
1508 1509
			wret = btrfs_cow_block(trans, root, b,
					       p->nodes[level + 1],
1510
					       p->slots[level + 1], &b);
1511
			if (wret) {
1512
				free_extent_buffer(b);
1513 1514
				ret = wret;
				goto done;
1515
			}
C
Chris Mason 已提交
1516
		}
1517
cow_done:
C
Chris Mason 已提交
1518
		BUG_ON(!cow && ins_len);
1519
		if (level != btrfs_header_level(b))
C
Chris Mason 已提交
1520
			WARN_ON(1);
1521
		level = btrfs_header_level(b);
1522

1523
		p->nodes[level] = b;
1524 1525
		if (!p->skip_locking)
			p->locks[level] = 1;
1526

1527
		btrfs_clear_path_blocking(p, NULL);
1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If cow is true, then we might be changing slot zero,
		 * which may require changing the parent.  So, we can't
		 * drop the lock until after we know which slot we're
		 * operating on.
		 */
		if (!cow)
			btrfs_unlock_up_safe(p, level + 1);

C
Chris Mason 已提交
1543
		ret = check_block(root, p, level);
1544 1545 1546 1547
		if (ret) {
			ret = -1;
			goto done;
		}
1548

1549
		ret = bin_search(b, key, level, &slot);
1550

1551
		if (level != 0) {
1552 1553 1554
			if (ret && slot > 0)
				slot -= 1;
			p->slots[level] = slot;
1555 1556
			if ((p->search_for_split || ins_len > 0) &&
			    btrfs_header_nritems(b) >=
1557
			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1558 1559 1560 1561 1562 1563 1564 1565
				int sret;

				sret = reada_for_balance(root, p, level);
				if (sret)
					goto again;

				btrfs_set_path_blocking(p);
				sret = split_node(trans, root, p, level);
1566
				btrfs_clear_path_blocking(p, NULL);
1567

C
Chris Mason 已提交
1568
				BUG_ON(sret > 0);
1569 1570 1571 1572
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1573 1574
				b = p->nodes[level];
				slot = p->slots[level];
1575 1576 1577
			} else if (ins_len < 0 &&
				   btrfs_header_nritems(b) <
				   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1578 1579 1580 1581 1582 1583 1584 1585
				int sret;

				sret = reada_for_balance(root, p, level);
				if (sret)
					goto again;

				btrfs_set_path_blocking(p);
				sret = balance_level(trans, root, p, level);
1586
				btrfs_clear_path_blocking(p, NULL);
1587

1588 1589 1590 1591
				if (sret) {
					ret = sret;
					goto done;
				}
1592
				b = p->nodes[level];
1593 1594
				if (!b) {
					btrfs_release_path(NULL, p);
1595
					goto again;
1596
				}
1597
				slot = p->slots[level];
1598
				BUG_ON(btrfs_header_nritems(b) == 1);
C
Chris Mason 已提交
1599
			}
1600 1601
			unlock_up(p, level, lowest_unlock);

1602
			/* this is only true while dropping a snapshot */
1603
			if (level == lowest_level) {
1604 1605
				ret = 0;
				goto done;
1606
			}
1607

1608 1609 1610 1611 1612 1613
			blocknr = btrfs_node_blockptr(b, slot);
			gen = btrfs_node_ptr_generation(b, slot);
			blocksize = btrfs_level_size(root, level - 1);

			tmp = btrfs_find_tree_block(root, blocknr, blocksize);
			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1614 1615 1616 1617 1618 1619 1620
				b = tmp;
			} else {
				/*
				 * reduce lock contention at high levels
				 * of the btree by dropping locks before
				 * we read.
				 */
1621
				if (level > 0) {
1622
					btrfs_release_path(NULL, p);
1623 1624
					if (tmp)
						free_extent_buffer(tmp);
1625 1626 1627 1628 1629
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);

1630 1631
					tmp = read_tree_block(root, blocknr,
							 blocksize, gen);
1632 1633 1634 1635
					if (tmp)
						free_extent_buffer(tmp);
					goto again;
				} else {
1636
					btrfs_set_path_blocking(p);
1637 1638
					if (tmp)
						free_extent_buffer(tmp);
1639 1640 1641 1642
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);
1643 1644 1645
					b = read_node_slot(root, b, slot);
				}
			}
1646 1647 1648
			if (!p->skip_locking) {
				int lret;

1649
				btrfs_clear_path_blocking(p, NULL);
1650 1651 1652 1653 1654
				lret = btrfs_try_spin_lock(b);

				if (!lret) {
					btrfs_set_path_blocking(p);
					btrfs_tree_lock(b);
1655
					btrfs_clear_path_blocking(p, b);
1656 1657
				}
			}
1658 1659
		} else {
			p->slots[level] = slot;
1660 1661
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
1662 1663 1664 1665
				int sret;

				btrfs_set_path_blocking(p);
				sret = split_leaf(trans, root, key,
1666
						      p, ins_len, ret == 0);
1667
				btrfs_clear_path_blocking(p, NULL);
1668

C
Chris Mason 已提交
1669
				BUG_ON(sret > 0);
1670 1671 1672 1673
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1674
			}
1675 1676
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock);
1677
			goto done;
1678 1679
		}
	}
1680 1681
	ret = 1;
done:
1682 1683 1684 1685 1686
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	btrfs_set_path_blocking(p);
1687
	return ret;
1688 1689
}

Z
Zheng Yan 已提交
1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706
int btrfs_merge_path(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_key *node_keys,
		     u64 *nodes, int lowest_level)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_key key;
	u64 bytenr;
	u64 generation;
	u32 blocksize;
	int level;
	int slot;
	int key_match;
	int ret;

	eb = btrfs_lock_root_node(root);
1707
	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
Z
Zheng Yan 已提交
1708 1709
	BUG_ON(ret);

1710 1711
	btrfs_set_lock_blocking(eb);

Z
Zheng Yan 已提交
1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731
	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		if (level == 0 || level <= lowest_level)
			break;

		ret = bin_search(parent, &node_keys[lowest_level], level,
				 &slot);
		if (ret && slot > 0)
			slot--;

		bytenr = btrfs_node_blockptr(parent, slot);
		if (nodes[level - 1] == bytenr)
			break;

		blocksize = btrfs_level_size(root, level - 1);
		generation = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(eb, &key, slot);
		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));

Y
Yan Zheng 已提交
1732 1733 1734 1735
		if (generation == trans->transid) {
			eb = read_tree_block(root, bytenr, blocksize,
					     generation);
			btrfs_tree_lock(eb);
1736
			btrfs_set_lock_blocking(eb);
Y
Yan Zheng 已提交
1737 1738
		}

Z
Zheng Yan 已提交
1739 1740 1741
		/*
		 * if node keys match and node pointer hasn't been modified
		 * in the running transaction, we can merge the path. for
Y
Yan Zheng 已提交
1742 1743 1744
		 * blocks owened by reloc trees, the node pointer check is
		 * skipped, this is because these blocks are fully controlled
		 * by the space balance code, no one else can modify them.
Z
Zheng Yan 已提交
1745 1746 1747
		 */
		if (!nodes[level - 1] || !key_match ||
		    (generation == trans->transid &&
Y
Yan Zheng 已提交
1748 1749 1750 1751 1752 1753
		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
			if (level == 1 || level == lowest_level + 1) {
				if (generation == trans->transid) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
				}
Z
Zheng Yan 已提交
1754
				break;
Y
Yan Zheng 已提交
1755
			}
Z
Zheng Yan 已提交
1756

Y
Yan Zheng 已提交
1757 1758 1759 1760
			if (generation != trans->transid) {
				eb = read_tree_block(root, bytenr, blocksize,
						generation);
				btrfs_tree_lock(eb);
1761
				btrfs_set_lock_blocking(eb);
Y
Yan Zheng 已提交
1762
			}
Z
Zheng Yan 已提交
1763 1764

			ret = btrfs_cow_block(trans, root, eb, parent, slot,
1765
					      &eb);
Z
Zheng Yan 已提交
1766 1767
			BUG_ON(ret);

Y
Yan Zheng 已提交
1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				if (!nodes[level - 1]) {
					nodes[level - 1] = eb->start;
					memcpy(&node_keys[level - 1], &key,
					       sizeof(node_keys[0]));
				} else {
					WARN_ON(1);
				}
			}

Z
Zheng Yan 已提交
1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			parent = eb;
			continue;
		}

		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
		btrfs_mark_buffer_dirty(parent);

		ret = btrfs_inc_extent_ref(trans, root,
					nodes[level - 1],
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
1794
					level - 1);
Z
Zheng Yan 已提交
1795 1796
		BUG_ON(ret);

Y
Yan Zheng 已提交
1797 1798 1799 1800 1801
		/*
		 * If the block was created in the running transaction,
		 * it's possible this is the last reference to it, so we
		 * should drop the subtree.
		 */
Z
Zheng Yan 已提交
1802
		if (generation == trans->transid) {
Y
Yan Zheng 已提交
1803 1804
			ret = btrfs_drop_subtree(trans, root, eb, parent);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1805 1806
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
Y
Yan Zheng 已提交
1807 1808 1809 1810 1811 1812 1813
		} else {
			ret = btrfs_free_extent(trans, root, bytenr,
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
					level - 1, 1);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1814 1815 1816 1817 1818 1819 1820 1821
		}
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return 0;
}

C
Chris Mason 已提交
1822 1823 1824 1825 1826 1827
/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node is points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
C
Chris Mason 已提交
1828 1829 1830
 *
 * If this fails to write a tree block, it returns -1, but continues
 * fixing up the blocks in ram so the tree is consistent.
C
Chris Mason 已提交
1831
 */
1832 1833 1834
static int fixup_low_keys(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_path *path,
			  struct btrfs_disk_key *key, int level)
1835 1836
{
	int i;
C
Chris Mason 已提交
1837
	int ret = 0;
1838 1839
	struct extent_buffer *t;

C
Chris Mason 已提交
1840
	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1841
		int tslot = path->slots[i];
1842
		if (!path->nodes[i])
1843
			break;
1844 1845
		t = path->nodes[i];
		btrfs_set_node_key(t, key, tslot);
C
Chris Mason 已提交
1846
		btrfs_mark_buffer_dirty(path->nodes[i]);
1847 1848 1849
		if (tslot != 0)
			break;
	}
C
Chris Mason 已提交
1850
	return ret;
1851 1852
}

Z
Zheng Yan 已提交
1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (comp_keys(&disk_key, new_key) >= 0)
			return -1;
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (comp_keys(&disk_key, new_key) <= 0)
			return -1;
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
		fixup_low_keys(trans, root, path, &disk_key, 1);
	return 0;
}

C
Chris Mason 已提交
1888 1889
/*
 * try to push data from one node into the next node left in the
1890
 * tree.
C
Chris Mason 已提交
1891 1892 1893
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
C
Chris Mason 已提交
1894
 */
1895 1896
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
1897
			  struct extent_buffer *src, int empty)
1898 1899
{
	int push_items = 0;
1900 1901
	int src_nritems;
	int dst_nritems;
C
Chris Mason 已提交
1902
	int ret = 0;
1903

1904 1905
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1906
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1907 1908
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);
1909

1910
	if (!empty && src_nritems <= 8)
1911 1912
		return 1;

C
Chris Mason 已提交
1913
	if (push_items <= 0)
1914 1915
		return 1;

1916
	if (empty) {
1917
		push_items = min(src_nritems, push_items);
1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);
1930

1931 1932 1933
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
C
Chris Mason 已提交
1934
			   push_items * sizeof(struct btrfs_key_ptr));
1935

1936
	if (push_items < src_nritems) {
1937 1938 1939 1940 1941 1942 1943 1944 1945
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
1946 1947 1948 1949

	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
	BUG_ON(ret);

1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961
	return ret;
}

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
1962 1963 1964 1965
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
1966 1967 1968 1969 1970 1971 1972
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

1973 1974 1975
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

1976 1977
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1978
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
C
Chris Mason 已提交
1979
	if (push_items <= 0)
1980
		return 1;
1981

C
Chris Mason 已提交
1982
	if (src_nritems < 4)
1983
		return 1;
1984 1985 1986

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
C
Chris Mason 已提交
1987
	if (max_push >= src_nritems)
1988
		return 1;
Y
Yan 已提交
1989

1990 1991 1992
	if (max_push < push_items)
		push_items = max_push;

1993 1994 1995 1996
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
1997

1998 1999 2000
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
C
Chris Mason 已提交
2001
			   push_items * sizeof(struct btrfs_key_ptr));
2002

2003 2004
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
2005

2006 2007
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
2008 2009 2010 2011

	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
	BUG_ON(ret);

C
Chris Mason 已提交
2012
	return ret;
2013 2014
}

C
Chris Mason 已提交
2015 2016 2017 2018
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
C
Chris Mason 已提交
2019 2020
 *
 * returns zero on success or < 0 on failure.
C
Chris Mason 已提交
2021
 */
C
Chris Mason 已提交
2022
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2023 2024
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
C
Chris Mason 已提交
2025
{
2026
	u64 lower_gen;
2027 2028
	struct extent_buffer *lower;
	struct extent_buffer *c;
2029
	struct extent_buffer *old;
2030
	struct btrfs_disk_key lower_key;
Z
Zheng Yan 已提交
2031
	int ret;
C
Chris Mason 已提交
2032 2033 2034 2035

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

2036 2037 2038 2039 2040 2041
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

Z
Zheng Yan 已提交
2042 2043
	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, trans->transid,
2044
				   level, root->node->start, 0);
2045 2046
	if (IS_ERR(c))
		return PTR_ERR(c);
2047

2048 2049 2050
	memset_extent_buffer(c, 0, 0, root->nodesize);
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
2051
	btrfs_set_header_bytenr(c, c->start);
2052 2053 2054 2055 2056 2057
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);
2058 2059 2060 2061 2062

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

2063
	btrfs_set_node_key(c, &lower_key, 0);
2064
	btrfs_set_node_blockptr(c, 0, lower->start);
2065
	lower_gen = btrfs_header_generation(lower);
Z
Zheng Yan 已提交
2066
	WARN_ON(lower_gen != trans->transid);
2067 2068

	btrfs_set_node_ptr_generation(c, 0, lower_gen);
2069

2070
	btrfs_mark_buffer_dirty(c);
2071

2072 2073
	spin_lock(&root->node_lock);
	old = root->node;
2074
	root->node = c;
2075 2076
	spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
2077 2078 2079
	ret = btrfs_update_extent_ref(trans, root, lower->start,
				      lower->start, c->start,
				      root->root_key.objectid,
2080
				      trans->transid, level - 1);
Z
Zheng Yan 已提交
2081 2082
	BUG_ON(ret);

2083 2084 2085
	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

2086
	add_root_to_dirty_list(root);
2087 2088
	extent_buffer_get(c);
	path->nodes[level] = c;
2089
	path->locks[level] = 1;
C
Chris Mason 已提交
2090 2091 2092 2093
	path->slots[level] = 0;
	return 0;
}

C
Chris Mason 已提交
2094 2095 2096
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
C
Chris Mason 已提交
2097
 *
C
Chris Mason 已提交
2098 2099
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
C
Chris Mason 已提交
2100 2101
 *
 * returns zero on success and < 0 on any error
C
Chris Mason 已提交
2102
 */
2103 2104
static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, struct btrfs_disk_key
2105
		      *key, u64 bytenr, int slot, int level)
C
Chris Mason 已提交
2106
{
2107
	struct extent_buffer *lower;
C
Chris Mason 已提交
2108
	int nritems;
C
Chris Mason 已提交
2109 2110

	BUG_ON(!path->nodes[level]);
2111 2112
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
C
Chris Mason 已提交
2113 2114
	if (slot > nritems)
		BUG();
C
Chris Mason 已提交
2115
	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
C
Chris Mason 已提交
2116 2117
		BUG();
	if (slot != nritems) {
2118 2119 2120
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
C
Chris Mason 已提交
2121
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
2122
	}
2123
	btrfs_set_node_key(lower, key, slot);
2124
	btrfs_set_node_blockptr(lower, slot, bytenr);
2125 2126
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2127 2128
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
C
Chris Mason 已提交
2129 2130 2131
	return 0;
}

C
Chris Mason 已提交
2132 2133 2134 2135 2136 2137
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
C
Chris Mason 已提交
2138 2139
 *
 * returns 0 on success and < 0 on failure
C
Chris Mason 已提交
2140
 */
2141 2142 2143
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
2144
{
2145 2146 2147
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
2148
	int mid;
C
Chris Mason 已提交
2149
	int ret;
C
Chris Mason 已提交
2150
	int wret;
2151
	u32 c_nritems;
2152

2153
	c = path->nodes[level];
2154
	WARN_ON(btrfs_header_generation(c) != trans->transid);
2155
	if (c == root->node) {
C
Chris Mason 已提交
2156
		/* trying to split the root, lets make a new one */
2157
		ret = insert_new_root(trans, root, path, level + 1);
C
Chris Mason 已提交
2158 2159
		if (ret)
			return ret;
2160 2161
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
2162 2163
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
2164
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2165
			return 0;
2166 2167
		if (ret < 0)
			return ret;
2168
	}
2169

2170
	c_nritems = btrfs_header_nritems(c);
2171

2172
	split = btrfs_alloc_free_block(trans, root, root->nodesize,
Z
Zheng Yan 已提交
2173 2174 2175
					path->nodes[level + 1]->start,
					root->root_key.objectid,
					trans->transid, level, c->start, 0);
2176 2177 2178 2179 2180
	if (IS_ERR(split))
		return PTR_ERR(split);

	btrfs_set_header_flags(split, btrfs_header_flags(c));
	btrfs_set_header_level(split, btrfs_header_level(c));
2181
	btrfs_set_header_bytenr(split, split->start);
2182 2183
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_owner(split, root->root_key.objectid);
2184
	btrfs_set_header_flags(split, 0);
2185 2186 2187
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
2188 2189 2190
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);
2191

2192
	mid = (c_nritems + 1) / 2;
2193 2194 2195 2196 2197 2198 2199

	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
C
Chris Mason 已提交
2200 2201
	ret = 0;

2202 2203 2204 2205
	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	btrfs_node_key(split, &disk_key, 0);
2206
	wret = insert_ptr(trans, root, path, &disk_key, split->start,
2207
			  path->slots[level + 1] + 1,
C
Chris Mason 已提交
2208
			  level + 1);
C
Chris Mason 已提交
2209 2210 2211
	if (wret)
		ret = wret;

Z
Zheng Yan 已提交
2212 2213 2214
	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
	BUG_ON(ret);

C
Chris Mason 已提交
2215
	if (path->slots[level] >= mid) {
C
Chris Mason 已提交
2216
		path->slots[level] -= mid;
2217
		btrfs_tree_unlock(c);
2218 2219
		free_extent_buffer(c);
		path->nodes[level] = split;
C
Chris Mason 已提交
2220 2221
		path->slots[level + 1] += 1;
	} else {
2222
		btrfs_tree_unlock(split);
2223
		free_extent_buffer(split);
2224
	}
C
Chris Mason 已提交
2225
	return ret;
2226 2227
}

C
Chris Mason 已提交
2228 2229 2230 2231 2232
/*
 * how many bytes are required to store the items in a leaf.  start
 * and nr indicate which items in the leaf to check.  This totals up the
 * space used both by the item structs and the item data
 */
2233
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2234 2235
{
	int data_len;
2236
	int nritems = btrfs_header_nritems(l);
2237
	int end = min(nritems, start + nr) - 1;
2238 2239 2240

	if (!nr)
		return 0;
2241 2242
	data_len = btrfs_item_end_nr(l, start);
	data_len = data_len - btrfs_item_offset_nr(l, end);
C
Chris Mason 已提交
2243
	data_len += sizeof(struct btrfs_item) * nr;
2244
	WARN_ON(data_len < 0);
2245 2246 2247
	return data_len;
}

2248 2249 2250 2251 2252
/*
 * The space between the end of the leaf items and
 * the start of the leaf data.  IOW, how much room
 * the leaf has left for both items and data
 */
C
Chris Mason 已提交
2253
noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2254
				   struct extent_buffer *leaf)
2255
{
2256 2257 2258 2259
	int nritems = btrfs_header_nritems(leaf);
	int ret;
	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
C
Chris Mason 已提交
2260 2261
		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
		       "used %d nritems %d\n",
J
Jens Axboe 已提交
2262
		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2263 2264 2265
		       leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
2266 2267
}

C
Chris Mason 已提交
2268 2269 2270
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
C
Chris Mason 已提交
2271 2272 2273
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
C
Chris Mason 已提交
2274
 */
2275
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2276 2277
			   *root, struct btrfs_path *path, int data_size,
			   int empty)
C
Chris Mason 已提交
2278
{
2279 2280 2281 2282
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2283
	int slot;
2284
	u32 i;
C
Chris Mason 已提交
2285 2286 2287
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2288
	struct btrfs_item *item;
2289
	u32 left_nritems;
2290
	u32 nr;
2291
	u32 right_nritems;
2292
	u32 data_end;
2293
	u32 this_item_size;
2294
	int ret;
C
Chris Mason 已提交
2295 2296

	slot = path->slots[1];
C
Chris Mason 已提交
2297
	if (!path->nodes[1])
C
Chris Mason 已提交
2298
		return 1;
C
Chris Mason 已提交
2299

C
Chris Mason 已提交
2300
	upper = path->nodes[1];
2301
	if (slot >= btrfs_header_nritems(upper) - 1)
C
Chris Mason 已提交
2302
		return 1;
2303

2304
	btrfs_assert_tree_locked(path->nodes[1]);
2305

2306
	right = read_node_slot(root, upper, slot + 1);
2307
	btrfs_tree_lock(right);
2308 2309
	btrfs_set_lock_blocking(right);

C
Chris Mason 已提交
2310
	free_space = btrfs_leaf_free_space(root, right);
2311
	if (free_space < data_size)
2312
		goto out_unlock;
2313

C
Chris Mason 已提交
2314
	/* cow and double check */
2315
	ret = btrfs_cow_block(trans, root, right, upper,
2316
			      slot + 1, &right);
2317 2318 2319
	if (ret)
		goto out_unlock;

C
Chris Mason 已提交
2320
	free_space = btrfs_leaf_free_space(root, right);
2321
	if (free_space < data_size)
2322
		goto out_unlock;
C
Chris Mason 已提交
2323

2324
	left_nritems = btrfs_header_nritems(left);
2325 2326
	if (left_nritems == 0)
		goto out_unlock;
2327

2328 2329 2330 2331 2332
	if (empty)
		nr = 0;
	else
		nr = 1;

Z
Zheng Yan 已提交
2333
	if (path->slots[0] >= left_nritems)
2334
		push_space += data_size;
Z
Zheng Yan 已提交
2335

2336 2337
	i = left_nritems - 1;
	while (i >= nr) {
2338
		item = btrfs_item_nr(left, i);
2339

Z
Zheng Yan 已提交
2340 2341 2342 2343 2344 2345 2346 2347 2348 2349
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

C
Chris Mason 已提交
2350
		if (path->slots[0] == i)
2351
			push_space += data_size;
2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362

		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
C
Chris Mason 已提交
2363
			break;
Z
Zheng Yan 已提交
2364

C
Chris Mason 已提交
2365
		push_items++;
2366
		push_space += this_item_size + sizeof(*item);
2367 2368 2369
		if (i == 0)
			break;
		i--;
2370 2371 2372 2373
	}
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
C
Chris Mason 已提交
2374
	}
2375

2376 2377
	if (push_items == 0)
		goto out_unlock;
2378

2379
	if (!empty && push_items == left_nritems)
2380
		WARN_ON(1);
2381

C
Chris Mason 已提交
2382
	/* push left to right */
2383
	right_nritems = btrfs_header_nritems(right);
2384

2385
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
C
Chris Mason 已提交
2386
	push_space -= leaf_data_end(root, left);
2387

C
Chris Mason 已提交
2388
	/* make room in the right data area */
2389 2390 2391 2392 2393 2394
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

C
Chris Mason 已提交
2395
	/* copy from the left data area */
2396
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
C
Chris Mason 已提交
2397 2398 2399
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);
2400 2401 2402 2403 2404

	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2405
	/* copy the items from left to right */
2406 2407 2408
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));
C
Chris Mason 已提交
2409 2410

	/* update the item pointers */
2411
	right_nritems += push_items;
2412
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2413
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2414
	for (i = 0; i < right_nritems; i++) {
2415
		item = btrfs_item_nr(right, i);
2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}
		push_space -= btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
C
Chris Mason 已提交
2430
	}
2431
	left_nritems -= push_items;
2432
	btrfs_set_header_nritems(left, left_nritems);
C
Chris Mason 已提交
2433

2434 2435
	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
2436
	btrfs_mark_buffer_dirty(right);
2437

Z
Zheng Yan 已提交
2438 2439 2440
	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
	BUG_ON(ret);

2441 2442
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
C
Chris Mason 已提交
2443
	btrfs_mark_buffer_dirty(upper);
C
Chris Mason 已提交
2444

C
Chris Mason 已提交
2445
	/* then fixup the leaf pointer in the path */
2446 2447
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
2448 2449 2450
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2451 2452
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
C
Chris Mason 已提交
2453 2454
		path->slots[1] += 1;
	} else {
2455
		btrfs_tree_unlock(right);
2456
		free_extent_buffer(right);
C
Chris Mason 已提交
2457 2458
	}
	return 0;
2459 2460 2461 2462 2463

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
C
Chris Mason 已提交
2464
}
2465

C
Chris Mason 已提交
2466 2467 2468 2469
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 */
2470
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2471 2472
			  *root, struct btrfs_path *path, int data_size,
			  int empty)
2473
{
2474 2475 2476
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
2477 2478 2479 2480 2481
	int slot;
	int i;
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2482
	struct btrfs_item *item;
2483
	u32 old_left_nritems;
2484
	u32 right_nritems;
2485
	u32 nr;
C
Chris Mason 已提交
2486 2487
	int ret = 0;
	int wret;
2488 2489
	u32 this_item_size;
	u32 old_left_item_size;
2490 2491

	slot = path->slots[1];
2492
	if (slot == 0)
2493
		return 1;
2494
	if (!path->nodes[1])
2495
		return 1;
2496

2497
	right_nritems = btrfs_header_nritems(right);
C
Chris Mason 已提交
2498
	if (right_nritems == 0)
2499 2500
		return 1;

2501
	btrfs_assert_tree_locked(path->nodes[1]);
2502

2503
	left = read_node_slot(root, path->nodes[1], slot - 1);
2504
	btrfs_tree_lock(left);
2505 2506
	btrfs_set_lock_blocking(left);

C
Chris Mason 已提交
2507
	free_space = btrfs_leaf_free_space(root, left);
2508
	if (free_space < data_size) {
2509 2510
		ret = 1;
		goto out;
2511
	}
C
Chris Mason 已提交
2512 2513

	/* cow and double check */
2514
	ret = btrfs_cow_block(trans, root, left,
2515
			      path->nodes[1], slot - 1, &left);
2516 2517
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
2518 2519
		ret = 1;
		goto out;
2520
	}
2521

C
Chris Mason 已提交
2522
	free_space = btrfs_leaf_free_space(root, left);
2523
	if (free_space < data_size) {
2524 2525
		ret = 1;
		goto out;
C
Chris Mason 已提交
2526 2527
	}

2528 2529 2530 2531 2532 2533
	if (empty)
		nr = right_nritems;
	else
		nr = right_nritems - 1;

	for (i = 0; i < nr; i++) {
2534
		item = btrfs_item_nr(right, i);
2535 2536 2537 2538 2539 2540 2541 2542
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

Z
Zheng Yan 已提交
2543 2544 2545 2546 2547 2548 2549 2550 2551 2552
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

2553
		if (path->slots[0] == i)
2554
			push_space += data_size;
2555 2556 2557

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
2558
			break;
2559

2560
		push_items++;
2561 2562 2563 2564 2565 2566
		push_space += this_item_size + sizeof(*item);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2567
	}
2568

2569
	if (push_items == 0) {
2570 2571
		ret = 1;
		goto out;
2572
	}
2573
	if (!empty && push_items == btrfs_header_nritems(right))
2574
		WARN_ON(1);
2575

2576
	/* push data from right to left */
2577 2578 2579 2580 2581
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2582
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
C
Chris Mason 已提交
2583
		     btrfs_item_offset_nr(right, push_items - 1);
2584 2585

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
C
Chris Mason 已提交
2586 2587
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
2588
		     btrfs_item_offset_nr(right, push_items - 1),
C
Chris Mason 已提交
2589
		     push_space);
2590
	old_left_nritems = btrfs_header_nritems(left);
2591
	BUG_ON(old_left_nritems <= 0);
2592

2593
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
C
Chris Mason 已提交
2594
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2595
		u32 ioff;
2596

2597
		item = btrfs_item_nr(left, i);
2598 2599 2600 2601 2602 2603 2604 2605
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

2606 2607
		ioff = btrfs_item_offset(left, item);
		btrfs_set_item_offset(left, item,
2608
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2609
	}
2610
	btrfs_set_header_nritems(left, old_left_nritems + push_items);
2611 2612 2613 2614
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}
2615 2616

	/* fixup right node */
2617
	if (push_items > right_nritems) {
C
Chris Mason 已提交
2618 2619
		printk(KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);
2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2632 2633 2634
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
2635
	}
2636 2637
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2638
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2639 2640
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		push_space = push_space - btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2656
	}
2657

2658
	btrfs_mark_buffer_dirty(left);
2659 2660
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
2661

Z
Zheng Yan 已提交
2662 2663 2664 2665
	ret = btrfs_update_ref(trans, root, right, left,
			       old_left_nritems, push_items);
	BUG_ON(ret);

2666 2667
	btrfs_item_key(right, &disk_key, 0);
	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
C
Chris Mason 已提交
2668 2669
	if (wret)
		ret = wret;
2670 2671 2672 2673

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
2674 2675 2676
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2677 2678
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
2679 2680
		path->slots[1] -= 1;
	} else {
2681
		btrfs_tree_unlock(left);
2682
		free_extent_buffer(left);
2683 2684
		path->slots[0] -= push_items;
	}
2685
	BUG_ON(path->slots[0] < 0);
C
Chris Mason 已提交
2686
	return ret;
2687 2688 2689 2690
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
2691 2692
}

C
Chris Mason 已提交
2693 2694 2695
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
C
Chris Mason 已提交
2696 2697
 *
 * returns 0 if all went well and < 0 on failure.
C
Chris Mason 已提交
2698
 */
2699 2700 2701 2702 2703
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
2704
{
2705
	struct extent_buffer *l;
2706
	u32 nritems;
2707 2708
	int mid;
	int slot;
2709
	struct extent_buffer *right;
2710 2711 2712
	int data_copy_size;
	int rt_data_off;
	int i;
2713
	int ret = 0;
C
Chris Mason 已提交
2714
	int wret;
2715 2716
	int double_split;
	int num_doubles = 0;
2717
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2718

C
Chris Mason 已提交
2719
	/* first try to make some room by pushing left and right */
2720
	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2721
		wret = push_leaf_right(trans, root, path, data_size, 0);
C
Chris Mason 已提交
2722
		if (wret < 0)
C
Chris Mason 已提交
2723
			return wret;
2724
		if (wret) {
2725
			wret = push_leaf_left(trans, root, path, data_size, 0);
2726 2727 2728 2729
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];
C
Chris Mason 已提交
2730

2731
		/* did the pushes work? */
2732
		if (btrfs_leaf_free_space(root, l) >= data_size)
2733
			return 0;
2734
	}
C
Chris Mason 已提交
2735

C
Chris Mason 已提交
2736
	if (!path->nodes[1]) {
2737
		ret = insert_new_root(trans, root, path, 1);
C
Chris Mason 已提交
2738 2739 2740
		if (ret)
			return ret;
	}
2741 2742 2743
again:
	double_split = 0;
	l = path->nodes[0];
2744
	slot = path->slots[0];
2745
	nritems = btrfs_header_nritems(l);
C
Chris Mason 已提交
2746
	mid = (nritems + 1) / 2;
2747

2748
	right = btrfs_alloc_free_block(trans, root, root->leafsize,
Z
Zheng Yan 已提交
2749 2750 2751
					path->nodes[1]->start,
					root->root_key.objectid,
					trans->transid, 0, l->start, 0);
2752 2753
	if (IS_ERR(right)) {
		BUG_ON(1);
2754
		return PTR_ERR(right);
2755
	}
2756 2757

	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2758
	btrfs_set_header_bytenr(right, right->start);
2759 2760 2761 2762 2763 2764
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);
2765 2766 2767 2768

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);
2769 2770
	if (mid <= slot) {
		if (nritems == 1 ||
2771
		    leaf_space_used(l, mid, nritems - mid) + data_size >
2772 2773 2774
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2775
				btrfs_set_header_nritems(right, 0);
2776
				wret = insert_ptr(trans, root, path,
2777
						  &disk_key, right->start,
2778 2779 2780
						  path->slots[1] + 1, 1);
				if (wret)
					ret = wret;
2781 2782

				btrfs_tree_unlock(path->nodes[0]);
2783 2784
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2785 2786
				path->slots[0] = 0;
				path->slots[1] += 1;
2787
				btrfs_mark_buffer_dirty(right);
2788 2789 2790
				return ret;
			}
			mid = slot;
2791 2792
			if (mid != nritems &&
			    leaf_space_used(l, mid, nritems - mid) +
2793
			    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2794 2795
				double_split = 1;
			}
2796 2797
		}
	} else {
2798
		if (leaf_space_used(l, 0, mid) + data_size >
2799
			BTRFS_LEAF_DATA_SIZE(root)) {
2800
			if (!extend && data_size && slot == 0) {
2801
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2802
				btrfs_set_header_nritems(right, 0);
2803 2804
				wret = insert_ptr(trans, root, path,
						  &disk_key,
2805
						  right->start,
2806
						  path->slots[1], 1);
2807 2808
				if (wret)
					ret = wret;
2809
				btrfs_tree_unlock(path->nodes[0]);
2810 2811
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2812
				path->slots[0] = 0;
2813 2814
				if (path->slots[1] == 0) {
					wret = fixup_low_keys(trans, root,
C
Chris Mason 已提交
2815
						      path, &disk_key, 1);
2816 2817 2818
					if (wret)
						ret = wret;
				}
2819
				btrfs_mark_buffer_dirty(right);
2820
				return ret;
2821
			} else if ((extend || !data_size) && slot == 0) {
2822 2823 2824 2825 2826
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
2827
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2828 2829
					double_split = 1;
				}
2830
			}
2831 2832
		}
	}
2833 2834 2835 2836 2837 2838 2839 2840 2841
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	copy_extent_buffer(right, l,
C
Chris Mason 已提交
2842 2843 2844
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);
2845

C
Chris Mason 已提交
2846
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2847
		      btrfs_item_end_nr(l, mid);
C
Chris Mason 已提交
2848

2849 2850
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861
		u32 ioff;

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(right, item);
2862
		btrfs_set_item_offset(right, item, ioff + rt_data_off);
C
Chris Mason 已提交
2863
	}
C
Chris Mason 已提交
2864

2865 2866 2867 2868 2869
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

2870
	btrfs_set_header_nritems(l, mid);
C
Chris Mason 已提交
2871
	ret = 0;
2872
	btrfs_item_key(right, &disk_key, 0);
2873 2874
	wret = insert_ptr(trans, root, path, &disk_key, right->start,
			  path->slots[1] + 1, 1);
C
Chris Mason 已提交
2875 2876
	if (wret)
		ret = wret;
2877 2878 2879

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
2880
	BUG_ON(path->slots[0] != slot);
2881

Z
Zheng Yan 已提交
2882 2883 2884
	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
	BUG_ON(ret);

2885
	if (mid <= slot) {
2886
		btrfs_tree_unlock(path->nodes[0]);
2887 2888
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
2889 2890
		path->slots[0] -= mid;
		path->slots[1] += 1;
2891 2892
	} else {
		btrfs_tree_unlock(right);
2893
		free_extent_buffer(right);
2894
	}
2895

2896
	BUG_ON(path->slots[0] < 0);
2897

2898 2899 2900 2901
	if (double_split) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
2902
	}
2903 2904 2905
	return ret;
}

2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959
/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation.  After
 * the split, the path is pointing to the old item.  The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be smaller enough to live alone on
 * a tree block with room for one extra struct btrfs_item
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	u32 item_size;
	struct extent_buffer *leaf;
	struct btrfs_key orig_key;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int ret = 0;
	int slot;
	u32 nritems;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;
	char *buf;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
	if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
		goto split;

	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	btrfs_release_path(root, path);

	path->search_for_split = 1;
	path->keep_locks = 1;

	ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
	path->search_for_split = 0;

	/* if our item isn't there or got smaller, return now */
	if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
							path->slots[0])) {
		path->keep_locks = 0;
		return -EAGAIN;
	}

2960 2961
	ret = split_leaf(trans, root, &orig_key, path,
			 sizeof(struct btrfs_item), 1);
2962 2963 2964
	path->keep_locks = 0;
	BUG_ON(ret);

2965 2966 2967 2968 2969 2970
	/*
	 * make sure any changes to the path from split_leaf leave it
	 * in a blocking state
	 */
	btrfs_set_path_blocking(path);

2971
	leaf = path->nodes[0];
2972
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029

split:
	item = btrfs_item_nr(leaf, path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);


	buf = kmalloc(item_size, GFP_NOFS);
	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);
	slot = path->slots[0] + 1;
	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);

	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(leaf, slot);

	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
	kfree(buf);
	return ret;
}

C
Chris Mason 已提交
3030 3031 3032 3033 3034 3035
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
C
Chris Mason 已提交
3036 3037 3038
int btrfs_truncate_item(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct btrfs_path *path,
3039
			u32 new_size, int from_end)
C
Chris Mason 已提交
3040 3041 3042 3043
{
	int ret = 0;
	int slot;
	int slot_orig;
3044 3045
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
3046 3047 3048 3049 3050 3051 3052 3053
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;

	slot_orig = path->slots[0];
3054
	leaf = path->nodes[0];
3055 3056 3057 3058 3059
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return 0;
C
Chris Mason 已提交
3060

3061
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
3062 3063
	data_end = leaf_data_end(root, leaf);

3064
	old_data_start = btrfs_item_offset_nr(leaf, slot);
3065

C
Chris Mason 已提交
3066 3067 3068 3069 3070 3071 3072 3073 3074 3075
	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
3076 3077
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
3078 3079 3080 3081 3082 3083 3084 3085 3086

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}

3087 3088
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff + size_diff);
C
Chris Mason 已提交
3089
	}
3090 3091 3092 3093 3094 3095

	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

C
Chris Mason 已提交
3096
	/* shift the data */
3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119
	if (from_end) {
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
C
Chris Mason 已提交
3120 3121
				      (unsigned long)fi,
				      offsetof(struct btrfs_file_extent_item,
3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135
						 disk_bytenr));
			}
		}

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, root, path, &disk_key, 1);
	}
3136 3137 3138 3139

	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
3140 3141

	ret = 0;
3142 3143
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3144
		BUG();
3145
	}
C
Chris Mason 已提交
3146 3147 3148
	return ret;
}

C
Chris Mason 已提交
3149 3150 3151
/*
 * make the item pointed to by the path bigger, data_size is the new size.
 */
3152 3153 3154
int btrfs_extend_item(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, struct btrfs_path *path,
		      u32 data_size)
3155 3156 3157 3158
{
	int ret = 0;
	int slot;
	int slot_orig;
3159 3160
	struct extent_buffer *leaf;
	struct btrfs_item *item;
3161 3162 3163 3164 3165 3166 3167
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;

	slot_orig = path->slots[0];
3168
	leaf = path->nodes[0];
3169

3170
	nritems = btrfs_header_nritems(leaf);
3171 3172
	data_end = leaf_data_end(root, leaf);

3173 3174
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
3175
		BUG();
3176
	}
3177
	slot = path->slots[0];
3178
	old_data = btrfs_item_end_nr(leaf, slot);
3179 3180

	BUG_ON(slot < 0);
3181 3182
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3183 3184
		printk(KERN_CRIT "slot %d too large, nritems %d\n",
		       slot, nritems);
3185 3186
		BUG_ON(1);
	}
3187 3188 3189 3190 3191 3192

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
3193 3194
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
3195 3196 3197 3198 3199 3200 3201 3202

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}
3203 3204
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff - data_size);
3205
	}
3206

3207 3208 3209 3210 3211
	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

3212
	/* shift the data */
3213
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3214 3215
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);
3216

3217
	data_end = old_data;
3218 3219 3220 3221
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);
3222 3223

	ret = 0;
3224 3225
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3226
		BUG();
3227
	}
3228 3229 3230
	return ret;
}

3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253
/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 * Returns the number of keys that were inserted.
 */
int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	int ret = 0;
	int slot;
	int i;
	u32 nritems;
	u32 total_data = 0;
	u32 total_size = 0;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct btrfs_key found_key;

3254 3255 3256 3257 3258 3259
	for (i = 0; i < nr; i++) {
		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
		    BTRFS_LEAF_DATA_SIZE(root)) {
			break;
			nr = i;
		}
3260
		total_data += data_size[i];
3261 3262 3263
		total_size += data_size[i] + sizeof(struct btrfs_item);
	}
	BUG_ON(nr == 0);
3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305

	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	if (btrfs_leaf_free_space(root, leaf) < total_size) {
		for (i = nr; i >= 0; i--) {
			total_data -= data_size[i];
			total_size -= data_size[i] + sizeof(struct btrfs_item);
			if (total_size < btrfs_leaf_free_space(root, leaf))
				break;
		}
		nr = i;
	}

	slot = path->slots[0];
	BUG_ON(slot < 0);

	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		item = btrfs_item_nr(leaf, slot);
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* figure out how many keys we can insert in here */
		total_data = data_size[0];
		for (i = 1; i < nr; i++) {
			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
				break;
			total_data += data_size[i];
		}
		nr = i;

		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3306
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382
			       slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		WARN_ON(leaf->map_token);
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff - total_data);
		}
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end - total_data, btrfs_leaf_data(leaf) +
			      data_end, old_data - data_end);
		data_end = old_data;
	} else {
		/*
		 * this sucks but it has to be done, if we are inserting at
		 * the end of the leaf only insert 1 of the items, since we
		 * have no way of knowing whats on the next leaf and we'd have
		 * to drop our current locks to figure it out
		 */
		nr = 1;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
	}

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
out:
	if (!ret)
		ret = nr;
	return ret;
}

C
Chris Mason 已提交
3383
/*
C
Chris Mason 已提交
3384
 * Given a key and some data, insert items into the tree.
C
Chris Mason 已提交
3385 3386
 * This does all the path init required, making room in the tree if needed.
 */
3387
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3388 3389
			    struct btrfs_root *root,
			    struct btrfs_path *path,
3390 3391
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
3392
{
3393 3394
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
3395
	int ret = 0;
3396
	int slot;
3397
	int slot_orig;
3398
	int i;
3399
	u32 nritems;
3400 3401
	u32 total_size = 0;
	u32 total_data = 0;
3402
	unsigned int data_end;
C
Chris Mason 已提交
3403 3404
	struct btrfs_disk_key disk_key;

C
Chris Mason 已提交
3405
	for (i = 0; i < nr; i++)
3406
		total_data += data_size[i];
3407

3408
	total_size = total_data + (nr * sizeof(struct btrfs_item));
3409
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
J
Josef Bacik 已提交
3410
	if (ret == 0)
3411
		return -EEXIST;
3412 3413
	if (ret < 0)
		goto out;
3414

3415
	slot_orig = path->slots[0];
3416
	leaf = path->nodes[0];
C
Chris Mason 已提交
3417

3418
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
3419
	data_end = leaf_data_end(root, leaf);
3420

3421
	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3422
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3423
		printk(KERN_CRIT "not enough freespace need %u have %d\n",
3424
		       total_size, btrfs_leaf_free_space(root, leaf));
3425
		BUG();
3426
	}
3427

3428
	slot = path->slots[0];
3429
	BUG_ON(slot < 0);
3430

3431
	if (slot != nritems) {
3432
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3433

3434 3435
		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3436
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3437 3438 3439
			       slot, old_data, data_end);
			BUG_ON(1);
		}
3440 3441 3442 3443
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
3444
		WARN_ON(leaf->map_token);
C
Chris Mason 已提交
3445
		for (i = slot; i < nritems; i++) {
3446
			u32 ioff;
3447

3448
			item = btrfs_item_nr(leaf, i);
3449 3450 3451 3452 3453 3454 3455 3456
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

3457
			ioff = btrfs_item_offset(leaf, item);
3458
			btrfs_set_item_offset(leaf, item, ioff - total_data);
C
Chris Mason 已提交
3459
		}
3460 3461 3462 3463
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}
3464 3465

		/* shift the items */
3466
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3467
			      btrfs_item_nr_offset(slot),
C
Chris Mason 已提交
3468
			      (nritems - slot) * sizeof(struct btrfs_item));
3469 3470

		/* shift the data */
3471
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3472
			      data_end - total_data, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3473
			      data_end, old_data - data_end);
3474 3475
		data_end = old_data;
	}
3476

3477
	/* setup the item for the new data */
3478 3479 3480 3481 3482 3483 3484 3485 3486
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
3487
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
3488 3489

	ret = 0;
3490 3491
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3492
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3493
	}
C
Chris Mason 已提交
3494

3495 3496
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3497
		BUG();
3498
	}
3499
out:
3500
	btrfs_unlock_up_safe(path, 1);
3501 3502 3503 3504 3505 3506 3507
	return ret;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
3508 3509 3510
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *cpu_key, void *data, u32
		      data_size)
3511 3512
{
	int ret = 0;
C
Chris Mason 已提交
3513
	struct btrfs_path *path;
3514 3515
	struct extent_buffer *leaf;
	unsigned long ptr;
3516

C
Chris Mason 已提交
3517 3518 3519
	path = btrfs_alloc_path();
	BUG_ON(!path);
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3520
	if (!ret) {
3521 3522 3523 3524
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
3525
	}
C
Chris Mason 已提交
3526
	btrfs_free_path(path);
C
Chris Mason 已提交
3527
	return ret;
3528 3529
}

C
Chris Mason 已提交
3530
/*
C
Chris Mason 已提交
3531
 * delete the pointer from a given node.
C
Chris Mason 已提交
3532
 *
C
Chris Mason 已提交
3533 3534
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
C
Chris Mason 已提交
3535
 */
3536 3537
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot)
3538
{
3539
	struct extent_buffer *parent = path->nodes[level];
3540
	u32 nritems;
C
Chris Mason 已提交
3541
	int ret = 0;
3542
	int wret;
3543

3544
	nritems = btrfs_header_nritems(parent);
C
Chris Mason 已提交
3545
	if (slot != nritems - 1) {
3546 3547 3548
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
C
Chris Mason 已提交
3549 3550
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
3551
	}
3552
	nritems--;
3553
	btrfs_set_header_nritems(parent, nritems);
3554
	if (nritems == 0 && parent == root->node) {
3555
		BUG_ON(btrfs_header_level(root->node) != 1);
3556
		/* just turn the root into a leaf and break */
3557
		btrfs_set_header_level(root->node, 0);
3558
	} else if (slot == 0) {
3559 3560 3561 3562
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
C
Chris Mason 已提交
3563 3564
		if (wret)
			ret = wret;
3565
	}
C
Chris Mason 已提交
3566
	btrfs_mark_buffer_dirty(parent);
C
Chris Mason 已提交
3567
	return ret;
3568 3569
}

3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].  bytenr is the node block pointer, but since the callers
 * already know it, it is faster to have them pass it down than to
 * read it out of the node again.
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path, u64 bytenr)
{
	int ret;
	u64 root_gen = btrfs_header_generation(path->nodes[1]);
3588 3589
	u64 parent_start = path->nodes[1]->start;
	u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3590 3591 3592 3593 3594

	ret = del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret)
		return ret;

3595 3596 3597 3598 3599 3600
	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

3601 3602
	ret = btrfs_free_extent(trans, root, bytenr,
				btrfs_level_size(root, 0),
3603
				parent_start, parent_owner,
3604
				root_gen, 0, 1);
3605 3606
	return ret;
}
C
Chris Mason 已提交
3607 3608 3609 3610
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
3611 3612
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
3613
{
3614 3615
	struct extent_buffer *leaf;
	struct btrfs_item *item;
3616 3617
	int last_off;
	int dsize = 0;
C
Chris Mason 已提交
3618 3619
	int ret = 0;
	int wret;
3620
	int i;
3621
	u32 nritems;
3622

3623
	leaf = path->nodes[0];
3624 3625 3626 3627 3628
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

3629
	nritems = btrfs_header_nritems(leaf);
3630

3631
	if (slot + nr != nritems) {
C
Chris Mason 已提交
3632
		int data_end = leaf_data_end(root, leaf);
3633 3634

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3635 3636
			      data_end + dsize,
			      btrfs_leaf_data(leaf) + data_end,
3637
			      last_off - data_end);
3638

3639
		for (i = slot + nr; i < nritems; i++) {
3640
			u32 ioff;
3641

3642
			item = btrfs_item_nr(leaf, i);
3643 3644 3645 3646 3647 3648 3649
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}
3650 3651
			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff + dsize);
C
Chris Mason 已提交
3652
		}
3653 3654 3655 3656 3657 3658

		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

3659
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3660
			      btrfs_item_nr_offset(slot + nr),
C
Chris Mason 已提交
3661
			      sizeof(struct btrfs_item) *
3662
			      (nritems - slot - nr));
3663
	}
3664 3665
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;
3666

C
Chris Mason 已提交
3667
	/* delete the leaf if we've emptied it */
3668
	if (nritems == 0) {
3669 3670
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
3671
		} else {
3672 3673
			ret = btrfs_del_leaf(trans, root, path, leaf->start);
			BUG_ON(ret);
3674
		}
3675
	} else {
3676
		int used = leaf_space_used(leaf, 0, nritems);
C
Chris Mason 已提交
3677
		if (slot == 0) {
3678 3679 3680
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
3681
			wret = fixup_low_keys(trans, root, path,
3682
					      &disk_key, 1);
C
Chris Mason 已提交
3683 3684 3685 3686
			if (wret)
				ret = wret;
		}

C
Chris Mason 已提交
3687
		/* delete the leaf if it is mostly empty */
3688
		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3689 3690 3691 3692
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
3693
			slot = path->slots[1];
3694 3695
			extent_buffer_get(leaf);

3696
			wret = push_leaf_left(trans, root, path, 1, 1);
3697
			if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3698
				ret = wret;
3699 3700 3701

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
3702
				wret = push_leaf_right(trans, root, path, 1, 1);
3703
				if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3704 3705
					ret = wret;
			}
3706 3707

			if (btrfs_header_nritems(leaf) == 0) {
3708
				path->slots[1] = slot;
C
Chris Mason 已提交
3709 3710
				ret = btrfs_del_leaf(trans, root, path,
						     leaf->start);
3711
				BUG_ON(ret);
3712
				free_extent_buffer(leaf);
C
Chris Mason 已提交
3713
			} else {
3714 3715 3716 3717 3718 3719 3720
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
3721
				free_extent_buffer(leaf);
3722
			}
3723
		} else {
3724
			btrfs_mark_buffer_dirty(leaf);
3725 3726
		}
	}
C
Chris Mason 已提交
3727
	return ret;
3728 3729
}

3730
/*
3731
 * search the tree again to find a leaf with lesser keys
3732 3733
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3734 3735 3736
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
3737 3738 3739
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
3740 3741 3742
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;
3743

3744
	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3745

3746 3747 3748 3749 3750 3751 3752 3753
	if (key.offset > 0)
		key.offset--;
	else if (key.type > 0)
		key.type--;
	else if (key.objectid > 0)
		key.objectid--;
	else
		return 1;
3754

3755 3756 3757 3758 3759 3760 3761 3762 3763
	btrfs_release_path(root, path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	if (ret < 0)
		return 0;
	return 1;
3764 3765
}

3766 3767 3768
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are either in cache or have a minimum
C
Chris Mason 已提交
3769
 * transaction id.  This is used by the btree defrag code, and tree logging
3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
C
Chris Mason 已提交
3781 3782 3783 3784
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
3785 3786 3787 3788
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3789
			 struct btrfs_key *max_key,
3790 3791 3792 3793 3794 3795
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
3796
	int sret;
3797 3798 3799 3800
	u32 nritems;
	int level;
	int ret = 1;

3801
	WARN_ON(!path->keep_locks);
3802 3803 3804
again:
	cur = btrfs_lock_root_node(root);
	level = btrfs_header_level(cur);
3805
	WARN_ON(path->nodes[level]);
3806 3807 3808 3809 3810 3811 3812
	path->nodes[level] = cur;
	path->locks[level] = 1;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
C
Chris Mason 已提交
3813
	while (1) {
3814 3815
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
3816
		sret = bin_search(cur, min_key, level, &slot);
3817

3818 3819
		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
3820 3821
			if (slot >= nritems)
				goto find_next_key;
3822 3823 3824 3825 3826
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
3827 3828
		if (sret && slot > 0)
			slot--;
3829 3830 3831 3832 3833
		/*
		 * check this node pointer against the cache_only and
		 * min_trans parameters.  If it isn't in cache or is too
		 * old, skip to the next one.
		 */
C
Chris Mason 已提交
3834
		while (slot < nritems) {
3835 3836 3837
			u64 blockptr;
			u64 gen;
			struct extent_buffer *tmp;
3838 3839
			struct btrfs_disk_key disk_key;

3840 3841 3842 3843 3844 3845 3846 3847 3848
			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			if (!cache_only)
				break;

3849 3850 3851 3852 3853 3854 3855 3856
			if (max_key) {
				btrfs_node_key(cur, &disk_key, slot);
				if (comp_keys(&disk_key, max_key) >= 0) {
					ret = 1;
					goto out;
				}
			}

3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867
			tmp = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));

			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				free_extent_buffer(tmp);
				break;
			}
			if (tmp)
				free_extent_buffer(tmp);
			slot++;
		}
3868
find_next_key:
3869 3870 3871 3872 3873
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
3874
			path->slots[level] = slot;
3875
			btrfs_set_path_blocking(path);
3876
			sret = btrfs_find_next_key(root, path, min_key, level,
3877
						  cache_only, min_trans);
3878
			if (sret == 0) {
3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892
				btrfs_release_path(root, path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1);
			goto out;
		}
3893
		btrfs_set_path_blocking(path);
3894 3895 3896
		cur = read_node_slot(root, cur, slot);

		btrfs_tree_lock(cur);
3897

3898 3899 3900
		path->locks[level - 1] = 1;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1);
3901
		btrfs_clear_path_blocking(path, NULL);
3902 3903 3904 3905
	}
out:
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
3906
	btrfs_set_path_blocking(path);
3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the cache_only and min_trans
 * parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
3922
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3923 3924
			struct btrfs_key *key, int lowest_level,
			int cache_only, u64 min_trans)
3925 3926 3927 3928 3929
{
	int level = lowest_level;
	int slot;
	struct extent_buffer *c;

3930
	WARN_ON(!path->keep_locks);
C
Chris Mason 已提交
3931
	while (level < BTRFS_MAX_LEVEL) {
3932 3933 3934 3935 3936
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
3937
next:
3938 3939
		if (slot >= btrfs_header_nritems(c)) {
			level++;
C
Chris Mason 已提交
3940
			if (level == BTRFS_MAX_LEVEL)
3941 3942 3943 3944 3945
				return 1;
			continue;
		}
		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965
		else {
			u64 blockptr = btrfs_node_blockptr(c, slot);
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (cache_only) {
				struct extent_buffer *cur;
				cur = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));
				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
					slot++;
					if (cur)
						free_extent_buffer(cur);
					goto next;
				}
				free_extent_buffer(cur);
			}
			if (gen < min_trans) {
				slot++;
				goto next;
			}
3966
			btrfs_node_key_to_cpu(c, key, slot);
3967
		}
3968 3969 3970 3971 3972
		return 0;
	}
	return 1;
}

C
Chris Mason 已提交
3973
/*
3974
 * search the tree again to find a leaf with greater keys
C
Chris Mason 已提交
3975 3976
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3977
 */
C
Chris Mason 已提交
3978
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3979 3980 3981
{
	int slot;
	int level = 1;
3982 3983
	struct extent_buffer *c;
	struct extent_buffer *next = NULL;
3984 3985 3986 3987 3988
	struct btrfs_key key;
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
C
Chris Mason 已提交
3989
	if (nritems == 0)
3990 3991 3992 3993 3994
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);

	btrfs_release_path(root, path);
3995
	path->keep_locks = 1;
3996 3997 3998 3999 4000 4001
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

4002
	btrfs_set_path_blocking(path);
4003
	nritems = btrfs_header_nritems(path->nodes[0]);
4004 4005 4006 4007 4008 4009
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
4010
	if (nritems > 0 && path->slots[0] < nritems - 1) {
4011
		path->slots[0]++;
4012 4013
		goto done;
	}
4014

C
Chris Mason 已提交
4015
	while (level < BTRFS_MAX_LEVEL) {
4016
		if (!path->nodes[level])
C
Chris Mason 已提交
4017
			return 1;
4018

4019 4020
		slot = path->slots[level] + 1;
		c = path->nodes[level];
4021
		if (slot >= btrfs_header_nritems(c)) {
4022
			level++;
C
Chris Mason 已提交
4023
			if (level == BTRFS_MAX_LEVEL)
4024
				return 1;
4025 4026
			continue;
		}
4027

4028 4029
		if (next) {
			btrfs_tree_unlock(next);
4030
			free_extent_buffer(next);
4031
		}
4032

4033
		/* the path was set to blocking above */
4034 4035
		if (level == 1 && (path->locks[1] || path->skip_locking) &&
		    path->reada)
4036
			reada_for_search(root, path, level, slot, 0);
4037

4038
		next = read_node_slot(root, c, slot);
4039
		if (!path->skip_locking) {
4040
			btrfs_assert_tree_locked(c);
4041
			btrfs_tree_lock(next);
4042
			btrfs_set_lock_blocking(next);
4043
		}
4044 4045 4046
		break;
	}
	path->slots[level] = slot;
C
Chris Mason 已提交
4047
	while (1) {
4048 4049
		level--;
		c = path->nodes[level];
4050 4051
		if (path->locks[level])
			btrfs_tree_unlock(c);
4052
		free_extent_buffer(c);
4053 4054
		path->nodes[level] = next;
		path->slots[level] = 0;
4055 4056
		if (!path->skip_locking)
			path->locks[level] = 1;
4057 4058
		if (!level)
			break;
4059 4060

		btrfs_set_path_blocking(path);
4061 4062
		if (level == 1 && path->locks[1] && path->reada)
			reada_for_search(root, path, level, slot, 0);
4063
		next = read_node_slot(root, next, 0);
4064
		if (!path->skip_locking) {
4065
			btrfs_assert_tree_locked(path->nodes[level]);
4066
			btrfs_tree_lock(next);
4067
			btrfs_set_lock_blocking(next);
4068
		}
4069
	}
4070 4071
done:
	unlock_up(path, 0, 1);
4072 4073
	return 0;
}
4074

4075 4076 4077 4078 4079 4080
/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
4081 4082 4083 4084 4085 4086
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
4087
	u32 nritems;
4088 4089
	int ret;

C
Chris Mason 已提交
4090
	while (1) {
4091
		if (path->slots[0] == 0) {
4092
			btrfs_set_path_blocking(path);
4093 4094 4095 4096 4097 4098 4099
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
4100 4101 4102 4103 4104 4105
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

4106 4107 4108
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.type == type)
			return 0;
4109 4110 4111 4112 4113
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
4114 4115 4116
	}
	return 1;
}