ctree.c 107.0 KB
Newer Older
C
Chris Mason 已提交
1
/*
C
Chris Mason 已提交
2
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
C
Chris Mason 已提交
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

19
#include <linux/sched.h>
20 21
#include "ctree.h"
#include "disk-io.h"
22
#include "transaction.h"
23
#include "print-tree.h"
24
#include "locking.h"
25

26 27 28
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29
		      *root, struct btrfs_key *ins_key,
30
		      struct btrfs_path *path, int data_size, int extend);
31 32
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
33
			  struct extent_buffer *src, int empty);
34 35 36 37
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
38 39
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot);
40

C
Chris Mason 已提交
41
struct btrfs_path *btrfs_alloc_path(void)
C
Chris Mason 已提交
42
{
C
Chris Mason 已提交
43
	struct btrfs_path *path;
J
Jeff Mahoney 已提交
44 45
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	if (path)
46
		path->reada = 1;
C
Chris Mason 已提交
47
	return path;
C
Chris Mason 已提交
48 49
}

50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (p->nodes[i] && p->locks[i])
			btrfs_set_lock_blocking(p->nodes[i]);
	}
}

/*
 * reset all the locked nodes in the patch to spinning locks.
65 66 67 68 69
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
70
 */
71 72
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held)
73 74
{
	int i;
75 76 77 78 79 80 81 82 83 84 85 86 87 88

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held)
		btrfs_set_lock_blocking(held);
	btrfs_set_path_blocking(p);
#endif

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
89 90 91
		if (p->nodes[i] && p->locks[i])
			btrfs_clear_lock_blocking(p->nodes[i]);
	}
92 93 94 95 96

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking(held);
#endif
97 98
}

C
Chris Mason 已提交
99
/* this also releases the path */
C
Chris Mason 已提交
100
void btrfs_free_path(struct btrfs_path *p)
101
{
C
Chris Mason 已提交
102 103
	btrfs_release_path(NULL, p);
	kmem_cache_free(btrfs_path_cachep, p);
104 105
}

C
Chris Mason 已提交
106 107 108 109 110 111
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that no locks or extent buffers held.
 */
C
Chris Mason 已提交
112
noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
113 114
{
	int i;
115

C
Chris Mason 已提交
116
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
117
		p->slots[i] = 0;
118
		if (!p->nodes[i])
119 120 121 122 123
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock(p->nodes[i]);
			p->locks[i] = 0;
		}
124
		free_extent_buffer(p->nodes[i]);
125
		p->nodes[i] = NULL;
126 127 128
	}
}

C
Chris Mason 已提交
129 130 131 132 133 134 135 136 137 138
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
139 140 141 142 143 144 145 146 147 148
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;
	spin_lock(&root->node_lock);
	eb = root->node;
	extent_buffer_get(eb);
	spin_unlock(&root->node_lock);
	return eb;
}

C
Chris Mason 已提交
149 150 151 152
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
153 154 155 156
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

C
Chris Mason 已提交
157
	while (1) {
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);

		spin_lock(&root->node_lock);
		if (eb == root->node) {
			spin_unlock(&root->node_lock);
			break;
		}
		spin_unlock(&root->node_lock);

		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

C
Chris Mason 已提交
174 175 176 177
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
178 179 180 181 182 183 184 185
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
}

C
Chris Mason 已提交
186 187 188 189 190
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
191 192 193 194 195 196 197 198 199
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	u32 nritems;
	int ret = 0;
	int level;
200
	struct btrfs_root *new_root;
201

202 203 204 205 206 207
	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
	if (!new_root)
		return -ENOMEM;

	memcpy(new_root, root, sizeof(*new_root));
	new_root->root_key.objectid = new_root_objectid;
208 209 210 211 212 213 214

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
215 216 217 218

	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
				     new_root_objectid, trans->transid,
				     level, buf->start, 0);
219 220
	if (IS_ERR(cow)) {
		kfree(new_root);
221
		return PTR_ERR(cow);
222
	}
223 224 225 226 227

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, new_root_objectid);
228
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
229

Y
Yan Zheng 已提交
230 231 232 233
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

234
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
Z
Zheng Yan 已提交
235
	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
236 237
	kfree(new_root);

238 239 240 241 242 243 244 245
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

C
Chris Mason 已提交
246
/*
C
Chris Mason 已提交
247 248 249 250
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
C
Chris Mason 已提交
251 252 253
 *
 * search_start -- an allocation hint for the new block
 *
C
Chris Mason 已提交
254 255 256
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
C
Chris Mason 已提交
257 258
 *
 * prealloc_dest -- if you have already reserved a destination for the cow,
C
Chris Mason 已提交
259 260
 * this uses that block instead of allocating a new one.
 * btrfs_alloc_reserved_extent is used to finish the allocation.
C
Chris Mason 已提交
261
 */
C
Chris Mason 已提交
262
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
263 264 265 266
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
267 268
			     u64 search_start, u64 empty_size,
			     u64 prealloc_dest)
C
Chris Mason 已提交
269
{
Z
Zheng Yan 已提交
270
	u64 parent_start;
271
	struct extent_buffer *cow;
272
	u32 nritems;
273
	int ret = 0;
274
	int level;
275
	int unlock_orig = 0;
276

277 278 279
	if (*cow_ret == buf)
		unlock_orig = 1;

280
	btrfs_assert_tree_locked(buf);
281

Z
Zheng Yan 已提交
282 283 284 285 286
	if (parent)
		parent_start = parent->start;
	else
		parent_start = 0;

287 288
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
289
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
290

291 292
	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
293

294 295 296 297 298 299 300
	if (prealloc_dest) {
		struct btrfs_key ins;

		ins.objectid = prealloc_dest;
		ins.offset = buf->len;
		ins.type = BTRFS_EXTENT_ITEM_KEY;

Z
Zheng Yan 已提交
301
		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
302
						  root->root_key.objectid,
303
						  trans->transid, level, &ins);
304 305
		BUG_ON(ret);
		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
306
					    buf->len, level);
307 308
	} else {
		cow = btrfs_alloc_free_block(trans, root, buf->len,
Z
Zheng Yan 已提交
309
					     parent_start,
310
					     root->root_key.objectid,
Z
Zheng Yan 已提交
311 312
					     trans->transid, level,
					     search_start, empty_size);
313
	}
314 315
	if (IS_ERR(cow))
		return PTR_ERR(cow);
316

317 318
	/* cow is set to blocking by btrfs_init_new_buffer */

319
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
320
	btrfs_set_header_bytenr(cow, cow->start);
321 322
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, root->root_key.objectid);
323
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
324

Y
Yan Zheng 已提交
325 326 327 328
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

329 330
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (btrfs_header_generation(buf) != trans->transid) {
Z
Zheng Yan 已提交
331 332
		u32 nr_extents;
		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
333 334
		if (ret)
			return ret;
Z
Zheng Yan 已提交
335 336 337

		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
		WARN_ON(ret);
Z
Zheng Yan 已提交
338 339 340 341
	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
		/*
		 * There are only two places that can drop reference to
		 * tree blocks owned by living reloc trees, one is here,
Y
Yan Zheng 已提交
342
		 * the other place is btrfs_drop_subtree. In both places,
Z
Zheng Yan 已提交
343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358
		 * we check reference count while tree block is locked.
		 * Furthermore, if reference count is one, it won't get
		 * increased by someone else.
		 */
		u32 refs;
		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
					      buf->len, &refs);
		BUG_ON(ret);
		if (refs == 1) {
			ret = btrfs_update_ref(trans, root, buf, cow,
					       0, nritems);
			clean_tree_block(trans, root, buf);
		} else {
			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
		}
		BUG_ON(ret);
359
	} else {
Z
Zheng Yan 已提交
360 361 362
		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
		if (ret)
			return ret;
363 364 365
		clean_tree_block(trans, root, buf);
	}

Z
Zheng Yan 已提交
366 367 368 369 370
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
		WARN_ON(ret);
	}

C
Chris Mason 已提交
371
	if (buf == root->node) {
372 373 374
		WARN_ON(parent && parent != buf);

		spin_lock(&root->node_lock);
C
Chris Mason 已提交
375
		root->node = cow;
376
		extent_buffer_get(cow);
377 378
		spin_unlock(&root->node_lock);

C
Chris Mason 已提交
379
		if (buf != root->commit_root) {
380
			btrfs_free_extent(trans, root, buf->start,
Z
Zheng Yan 已提交
381 382 383
					  buf->len, buf->start,
					  root->root_key.objectid,
					  btrfs_header_generation(buf),
384
					  level, 1);
C
Chris Mason 已提交
385
		}
386
		free_extent_buffer(buf);
387
		add_root_to_dirty_list(root);
C
Chris Mason 已提交
388
	} else {
389
		btrfs_set_node_blockptr(parent, parent_slot,
390
					cow->start);
391 392 393
		WARN_ON(trans->transid == 0);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
C
Chris Mason 已提交
394
		btrfs_mark_buffer_dirty(parent);
395
		WARN_ON(btrfs_header_generation(parent) != trans->transid);
396
		btrfs_free_extent(trans, root, buf->start, buf->len,
Z
Zheng Yan 已提交
397
				  parent_start, btrfs_header_owner(parent),
398
				  btrfs_header_generation(parent), level, 1);
C
Chris Mason 已提交
399
	}
400 401
	if (unlock_orig)
		btrfs_tree_unlock(buf);
402
	free_extent_buffer(buf);
C
Chris Mason 已提交
403
	btrfs_mark_buffer_dirty(cow);
C
Chris Mason 已提交
404
	*cow_ret = cow;
C
Chris Mason 已提交
405 406 407
	return 0;
}

C
Chris Mason 已提交
408 409 410 411 412
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
C
Chris Mason 已提交
413
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
414 415
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
416
		    struct extent_buffer **cow_ret, u64 prealloc_dest)
417 418
{
	u64 search_start;
419
	int ret;
C
Chris Mason 已提交
420

421
	if (trans->transaction != root->fs_info->running_transaction) {
C
Chris Mason 已提交
422 423 424
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)
425 426 427 428
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
C
Chris Mason 已提交
429 430 431
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)root->fs_info->generation);
432 433
		WARN_ON(1);
	}
C
Chris Mason 已提交
434

435 436
	if (btrfs_header_generation(buf) == trans->transid &&
	    btrfs_header_owner(buf) == root->root_key.objectid &&
437
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
438
		*cow_ret = buf;
439
		WARN_ON(prealloc_dest);
440 441
		return 0;
	}
442

443
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
444 445 446 447 448

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

449
	ret = __btrfs_cow_block(trans, root, buf, parent,
450 451
				 parent_slot, cow_ret, search_start, 0,
				 prealloc_dest);
452
	return ret;
453 454
}

C
Chris Mason 已提交
455 456 457 458
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
459
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
460
{
461
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
462
		return 1;
463
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
464 465 466 467
		return 1;
	return 0;
}

468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491
/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	if (k1.objectid > k2->objectid)
		return 1;
	if (k1.objectid < k2->objectid)
		return -1;
	if (k1.type > k2->type)
		return 1;
	if (k1.type < k2->type)
		return -1;
	if (k1.offset > k2->offset)
		return 1;
	if (k1.offset < k2->offset)
		return -1;
	return 0;
}

492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510
/*
 * same as comp_keys only with two btrfs_key's
 */
static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
511

C
Chris Mason 已提交
512 513 514 515 516
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
517
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
518
		       struct btrfs_root *root, struct extent_buffer *parent,
519 520
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
521
{
522
	struct extent_buffer *cur;
523
	u64 blocknr;
524
	u64 gen;
525 526
	u64 search_start = *last_ret;
	u64 last_block = 0;
527 528 529 530 531
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
532
	int parent_level;
533 534
	int uptodate;
	u32 blocksize;
535 536
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;
537

538 539 540 541
	parent_level = btrfs_header_level(parent);
	if (cache_only && parent_level != 1)
		return 0;

C
Chris Mason 已提交
542
	if (trans->transaction != root->fs_info->running_transaction)
543
		WARN_ON(1);
C
Chris Mason 已提交
544
	if (trans->transid != root->fs_info->generation)
545
		WARN_ON(1);
546

547 548
	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
549 550 551 552 553
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

554 555
	btrfs_set_lock_blocking(parent);

556 557
	for (i = start_slot; i < end_slot; i++) {
		int close = 1;
558

559 560 561 562 563 564 565 566
		if (!parent->map_token) {
			map_extent_buffer(parent,
					btrfs_node_key_ptr_offset(i),
					sizeof(struct btrfs_key_ptr),
					&parent->map_token, &parent->kaddr,
					&parent->map_start, &parent->map_len,
					KM_USER1);
		}
567 568 569 570 571
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
572
		blocknr = btrfs_node_blockptr(parent, i);
573
		gen = btrfs_node_ptr_generation(parent, i);
574 575
		if (last_block == 0)
			last_block = blocknr;
576

577
		if (i > 0) {
578 579
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
580
		}
C
Chris Mason 已提交
581
		if (!close && i < end_slot - 2) {
582 583
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
584
		}
585 586
		if (close) {
			last_block = blocknr;
587
			continue;
588
		}
589 590 591 592 593
		if (parent->map_token) {
			unmap_extent_buffer(parent, parent->map_token,
					    KM_USER1);
			parent->map_token = NULL;
		}
594

595 596
		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
597
			uptodate = btrfs_buffer_uptodate(cur, gen);
598 599
		else
			uptodate = 0;
600
		if (!cur || !uptodate) {
601
			if (cache_only) {
602
				free_extent_buffer(cur);
603 604
				continue;
			}
605 606
			if (!cur) {
				cur = read_tree_block(root, blocknr,
607
							 blocksize, gen);
608
			} else if (!uptodate) {
609
				btrfs_read_buffer(cur, gen);
610
			}
611
		}
612
		if (search_start == 0)
613
			search_start = last_block;
614

615
		btrfs_tree_lock(cur);
616
		btrfs_set_lock_blocking(cur);
617
		err = __btrfs_cow_block(trans, root, cur, parent, i,
618
					&cur, search_start,
619
					min(16 * blocksize,
620
					    (end_slot - i) * blocksize), 0);
Y
Yan 已提交
621
		if (err) {
622
			btrfs_tree_unlock(cur);
623
			free_extent_buffer(cur);
624
			break;
Y
Yan 已提交
625
		}
626 627
		search_start = cur->start;
		last_block = cur->start;
628
		*last_ret = search_start;
629 630
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
631
	}
632 633 634 635 636
	if (parent->map_token) {
		unmap_extent_buffer(parent, parent->map_token,
				    KM_USER1);
		parent->map_token = NULL;
	}
637 638 639
	return err;
}

C
Chris Mason 已提交
640 641 642 643 644
/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
C
Chris Mason 已提交
645
static inline unsigned int leaf_data_end(struct btrfs_root *root,
646
					 struct extent_buffer *leaf)
647
{
648
	u32 nr = btrfs_header_nritems(leaf);
649
	if (nr == 0)
C
Chris Mason 已提交
650
		return BTRFS_LEAF_DATA_SIZE(root);
651
	return btrfs_item_offset_nr(leaf, nr - 1);
652 653
}

C
Chris Mason 已提交
654 655 656 657
/*
 * extra debugging checks to make sure all the items in a key are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
658 659
static int check_node(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
660
{
661 662 663 664
	struct extent_buffer *parent = NULL;
	struct extent_buffer *node = path->nodes[level];
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key node_key;
C
Chris Mason 已提交
665
	int parent_slot;
666 667
	int slot;
	struct btrfs_key cpukey;
668
	u32 nritems = btrfs_header_nritems(node);
C
Chris Mason 已提交
669 670

	if (path->nodes[level + 1])
671
		parent = path->nodes[level + 1];
A
Aneesh 已提交
672

673
	slot = path->slots[level];
674 675
	BUG_ON(nritems == 0);
	if (parent) {
A
Aneesh 已提交
676
		parent_slot = path->slots[level + 1];
677 678 679
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_node_key(node, &node_key, 0);
		BUG_ON(memcmp(&parent_key, &node_key,
C
Chris Mason 已提交
680
			      sizeof(struct btrfs_disk_key)));
681
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
682
		       btrfs_header_bytenr(node));
C
Chris Mason 已提交
683
	}
C
Chris Mason 已提交
684
	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
685
	if (slot != 0) {
686 687 688
		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
689 690
	}
	if (slot < nritems - 1) {
691 692 693
		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
C
Chris Mason 已提交
694 695 696 697
	}
	return 0;
}

C
Chris Mason 已提交
698 699 700 701
/*
 * extra checking to make sure all the items in a leaf are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
702 703
static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
704
{
705 706
	struct extent_buffer *leaf = path->nodes[level];
	struct extent_buffer *parent = NULL;
C
Chris Mason 已提交
707
	int parent_slot;
708
	struct btrfs_key cpukey;
709 710 711
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key leaf_key;
	int slot = path->slots[0];
712

713
	u32 nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
714 715

	if (path->nodes[level + 1])
716
		parent = path->nodes[level + 1];
717 718 719 720 721

	if (nritems == 0)
		return 0;

	if (parent) {
A
Aneesh 已提交
722
		parent_slot = path->slots[level + 1];
723 724
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_item_key(leaf, &leaf_key, 0);
725

726
		BUG_ON(memcmp(&parent_key, &leaf_key,
C
Chris Mason 已提交
727
		       sizeof(struct btrfs_disk_key)));
728
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
729
		       btrfs_header_bytenr(leaf));
730 731 732 733 734 735
	}
	if (slot != 0 && slot < nritems - 1) {
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
		if (comp_keys(&leaf_key, &cpukey) <= 0) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
736
			printk(KERN_CRIT "slot %d offset bad key\n", slot);
737 738 739 740 741
			BUG_ON(1);
		}
		if (btrfs_item_offset_nr(leaf, slot - 1) !=
		       btrfs_item_end_nr(leaf, slot)) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
742
			printk(KERN_CRIT "slot %d offset bad\n", slot);
743 744
			BUG_ON(1);
		}
745 746
	}
	if (slot < nritems - 1) {
747 748 749 750 751 752
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
		if (btrfs_item_offset_nr(leaf, slot) !=
			btrfs_item_end_nr(leaf, slot + 1)) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
753
			printk(KERN_CRIT "slot %d offset bad\n", slot);
754 755
			BUG_ON(1);
		}
C
Chris Mason 已提交
756
	}
757 758
	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
C
Chris Mason 已提交
759 760 761
	return 0;
}

C
Chris Mason 已提交
762
static noinline int check_block(struct btrfs_root *root,
763
				struct btrfs_path *path, int level)
C
Chris Mason 已提交
764
{
765
	return 0;
C
Chris Mason 已提交
766
	if (level == 0)
C
Chris Mason 已提交
767 768
		return check_leaf(root, path, level);
	return check_node(root, path, level);
C
Chris Mason 已提交
769 770
}

C
Chris Mason 已提交
771
/*
772 773 774
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
C
Chris Mason 已提交
775 776 777 778 779 780
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
781 782 783 784
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
785 786 787 788 789
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
790
	struct btrfs_disk_key *tmp = NULL;
791 792 793 794 795 796
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
797
	int err;
798

C
Chris Mason 已提交
799
	while (low < high) {
800
		mid = (low + high) / 2;
801 802 803 804 805
		offset = p + mid * item_size;

		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
806
			if (map_token) {
807
				unmap_extent_buffer(eb, map_token, KM_USER0);
808 809
				map_token = NULL;
			}
810 811

			err = map_private_extent_buffer(eb, offset,
812 813 814 815 816 817 818 819 820 821 822 823
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}
824 825 826 827 828

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
829 830 831 832 833 834 835 836
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
837 838
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
839 840 841 842
			return 0;
		}
	}
	*slot = low;
843 844
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
845 846 847
	return 1;
}

C
Chris Mason 已提交
848 849 850 851
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
852 853
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
854
{
855 856 857
	if (level == 0) {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
C
Chris Mason 已提交
858
					  sizeof(struct btrfs_item),
859
					  key, btrfs_header_nritems(eb),
860
					  slot);
861
	} else {
862 863
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
C
Chris Mason 已提交
864
					  sizeof(struct btrfs_key_ptr),
865
					  key, btrfs_header_nritems(eb),
866
					  slot);
867 868 869 870
	}
	return -1;
}

C
Chris Mason 已提交
871 872 873 874
/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
875
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
876
				   struct extent_buffer *parent, int slot)
877
{
878
	int level = btrfs_header_level(parent);
879 880
	if (slot < 0)
		return NULL;
881
	if (slot >= btrfs_header_nritems(parent))
882
		return NULL;
883 884 885

	BUG_ON(level == 0);

886
	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
887 888
		       btrfs_level_size(root, level - 1),
		       btrfs_node_ptr_generation(parent, slot));
889 890
}

C
Chris Mason 已提交
891 892 893 894 895
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 */
896
static noinline int balance_level(struct btrfs_trans_handle *trans,
897 898
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
899
{
900 901 902 903
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
904 905 906 907
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
908
	int err_on_enospc = 0;
909
	u64 orig_ptr;
910 911 912 913

	if (level == 0)
		return 0;

914
	mid = path->nodes[level];
915

916
	WARN_ON(!path->locks[level]);
917 918
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

919
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
920

C
Chris Mason 已提交
921
	if (level < BTRFS_MAX_LEVEL - 1)
922
		parent = path->nodes[level + 1];
923 924
	pslot = path->slots[level + 1];

C
Chris Mason 已提交
925 926 927 928
	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
929 930
	if (!parent) {
		struct extent_buffer *child;
931

932
		if (btrfs_header_nritems(mid) != 1)
933 934 935
			return 0;

		/* promote the child to a root */
936
		child = read_node_slot(root, mid, 0);
937
		BUG_ON(!child);
938
		btrfs_tree_lock(child);
939
		btrfs_set_lock_blocking(child);
940
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
941 942
		BUG_ON(ret);

943
		spin_lock(&root->node_lock);
944
		root->node = child;
945 946
		spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
947 948 949
		ret = btrfs_update_extent_ref(trans, root, child->start,
					      mid->start, child->start,
					      root->root_key.objectid,
950
					      trans->transid, level - 1);
Z
Zheng Yan 已提交
951 952
		BUG_ON(ret);

953
		add_root_to_dirty_list(root);
954
		btrfs_tree_unlock(child);
955

956
		path->locks[level] = 0;
957
		path->nodes[level] = NULL;
958
		clean_tree_block(trans, root, mid);
959
		btrfs_tree_unlock(mid);
960
		/* once for the path */
961
		free_extent_buffer(mid);
962
		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
Z
Zheng Yan 已提交
963
					mid->start, root->root_key.objectid,
964 965
					btrfs_header_generation(mid),
					level, 1);
966
		/* once for the root ptr */
967
		free_extent_buffer(mid);
968
		return ret;
969
	}
970
	if (btrfs_header_nritems(mid) >
C
Chris Mason 已提交
971
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
972 973
		return 0;

974
	if (btrfs_header_nritems(mid) < 2)
975 976
		err_on_enospc = 1;

977 978
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
979
		btrfs_tree_lock(left);
980
		btrfs_set_lock_blocking(left);
981
		wret = btrfs_cow_block(trans, root, left,
982
				       parent, pslot - 1, &left, 0);
983 984 985 986
		if (wret) {
			ret = wret;
			goto enospc;
		}
987
	}
988 989
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
990
		btrfs_tree_lock(right);
991
		btrfs_set_lock_blocking(right);
992
		wret = btrfs_cow_block(trans, root, right,
993
				       parent, pslot + 1, &right, 0);
994 995 996 997 998 999 1000
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
1001 1002
	if (left) {
		orig_slot += btrfs_header_nritems(left);
1003
		wret = push_node_left(trans, root, left, mid, 1);
1004 1005
		if (wret < 0)
			ret = wret;
1006
		if (btrfs_header_nritems(mid) < 2)
1007
			err_on_enospc = 1;
1008
	}
1009 1010 1011 1012

	/*
	 * then try to empty the right most buffer into the middle
	 */
1013
	if (right) {
1014
		wret = push_node_left(trans, root, mid, right, 1);
1015
		if (wret < 0 && wret != -ENOSPC)
1016
			ret = wret;
1017
		if (btrfs_header_nritems(right) == 0) {
1018
			u64 bytenr = right->start;
1019
			u64 generation = btrfs_header_generation(parent);
1020 1021
			u32 blocksize = right->len;

1022
			clean_tree_block(trans, root, right);
1023
			btrfs_tree_unlock(right);
1024
			free_extent_buffer(right);
1025
			right = NULL;
1026 1027
			wret = del_ptr(trans, root, path, level + 1, pslot +
				       1);
1028 1029
			if (wret)
				ret = wret;
1030
			wret = btrfs_free_extent(trans, root, bytenr,
Z
Zheng Yan 已提交
1031
						 blocksize, parent->start,
1032
						 btrfs_header_owner(parent),
1033
						 generation, level, 1);
1034 1035 1036
			if (wret)
				ret = wret;
		} else {
1037 1038 1039 1040
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
1041 1042
		}
	}
1043
	if (btrfs_header_nritems(mid) == 1) {
1044 1045 1046 1047 1048 1049 1050 1051 1052
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
1053 1054
		BUG_ON(!left);
		wret = balance_node_right(trans, root, mid, left);
1055
		if (wret < 0) {
1056
			ret = wret;
1057 1058
			goto enospc;
		}
1059 1060 1061 1062 1063
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
1064 1065
		BUG_ON(wret == 1);
	}
1066
	if (btrfs_header_nritems(mid) == 0) {
1067
		/* we've managed to empty the middle node, drop it */
1068
		u64 root_gen = btrfs_header_generation(parent);
1069 1070
		u64 bytenr = mid->start;
		u32 blocksize = mid->len;
1071

1072
		clean_tree_block(trans, root, mid);
1073
		btrfs_tree_unlock(mid);
1074
		free_extent_buffer(mid);
1075
		mid = NULL;
1076
		wret = del_ptr(trans, root, path, level + 1, pslot);
1077 1078
		if (wret)
			ret = wret;
1079
		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
Z
Zheng Yan 已提交
1080
					 parent->start,
1081
					 btrfs_header_owner(parent),
1082
					 root_gen, level, 1);
1083 1084
		if (wret)
			ret = wret;
1085 1086
	} else {
		/* update the parent key to reflect our changes */
1087 1088 1089 1090
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
1091
	}
1092

1093
	/* update the path */
1094 1095 1096
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
1097
			/* left was locked after cow */
1098
			path->nodes[level] = left;
1099 1100
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
1101 1102
			if (mid) {
				btrfs_tree_unlock(mid);
1103
				free_extent_buffer(mid);
1104
			}
1105
		} else {
1106
			orig_slot -= btrfs_header_nritems(left);
1107 1108 1109
			path->slots[level] = orig_slot;
		}
	}
1110
	/* double check we haven't messed things up */
C
Chris Mason 已提交
1111
	check_block(root, path, level);
C
Chris Mason 已提交
1112
	if (orig_ptr !=
1113
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1114
		BUG();
1115
enospc:
1116 1117
	if (right) {
		btrfs_tree_unlock(right);
1118
		free_extent_buffer(right);
1119 1120 1121 1122
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
1123
		free_extent_buffer(left);
1124
	}
1125 1126 1127
	return ret;
}

C
Chris Mason 已提交
1128 1129 1130 1131
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
C
Chris Mason 已提交
1132
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1133 1134
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
1135
{
1136 1137 1138 1139
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
1140 1141 1142 1143 1144 1145 1146 1147 1148
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 1;

1149
	mid = path->nodes[level];
1150
	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1151 1152 1153
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1)
1154
		parent = path->nodes[level + 1];
1155 1156
	pslot = path->slots[level + 1];

1157
	if (!parent)
1158 1159
		return 1;

1160
	left = read_node_slot(root, parent, pslot - 1);
1161 1162

	/* first, try to make some room in the middle buffer */
1163
	if (left) {
1164
		u32 left_nr;
1165 1166

		btrfs_tree_lock(left);
1167 1168
		btrfs_set_lock_blocking(left);

1169
		left_nr = btrfs_header_nritems(left);
C
Chris Mason 已提交
1170 1171 1172
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1173
			ret = btrfs_cow_block(trans, root, left, parent,
1174
					      pslot - 1, &left, 0);
1175 1176 1177 1178
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
1179
						      left, mid, 0);
1180
			}
C
Chris Mason 已提交
1181
		}
1182 1183 1184
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1185
			struct btrfs_disk_key disk_key;
1186
			orig_slot += left_nr;
1187 1188 1189 1190 1191
			btrfs_node_key(mid, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
1192 1193
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
1194
				btrfs_tree_unlock(mid);
1195
				free_extent_buffer(mid);
1196 1197
			} else {
				orig_slot -=
1198
					btrfs_header_nritems(left);
1199
				path->slots[level] = orig_slot;
1200
				btrfs_tree_unlock(left);
1201
				free_extent_buffer(left);
1202 1203 1204
			}
			return 0;
		}
1205
		btrfs_tree_unlock(left);
1206
		free_extent_buffer(left);
1207
	}
1208
	right = read_node_slot(root, parent, pslot + 1);
1209 1210 1211 1212

	/*
	 * then try to empty the right most buffer into the middle
	 */
1213
	if (right) {
C
Chris Mason 已提交
1214
		u32 right_nr;
1215

1216
		btrfs_tree_lock(right);
1217 1218
		btrfs_set_lock_blocking(right);

1219
		right_nr = btrfs_header_nritems(right);
C
Chris Mason 已提交
1220 1221 1222
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1223 1224
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
1225
					      &right, 0);
1226 1227 1228 1229
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
1230
							  right, mid);
1231
			}
C
Chris Mason 已提交
1232
		}
1233 1234 1235
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1236 1237 1238 1239 1240 1241 1242 1243
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
1244 1245
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
1246
					btrfs_header_nritems(mid);
1247
				btrfs_tree_unlock(mid);
1248
				free_extent_buffer(mid);
1249
			} else {
1250
				btrfs_tree_unlock(right);
1251
				free_extent_buffer(right);
1252 1253 1254
			}
			return 0;
		}
1255
		btrfs_tree_unlock(right);
1256
		free_extent_buffer(right);
1257 1258 1259 1260
	}
	return 1;
}

1261
/*
C
Chris Mason 已提交
1262 1263
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
1264
 */
1265 1266 1267
static noinline void reada_for_search(struct btrfs_root *root,
				      struct btrfs_path *path,
				      int level, int slot, u64 objectid)
1268
{
1269
	struct extent_buffer *node;
1270
	struct btrfs_disk_key disk_key;
1271 1272
	u32 nritems;
	u64 search;
1273
	u64 target;
1274
	u64 nread = 0;
1275
	int direction = path->reada;
1276
	struct extent_buffer *eb;
1277 1278 1279
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;
1280

1281
	if (level != 1)
1282 1283 1284
		return;

	if (!path->nodes[level])
1285 1286
		return;

1287
	node = path->nodes[level];
1288

1289
	search = btrfs_node_blockptr(node, slot);
1290 1291
	blocksize = btrfs_level_size(root, level - 1);
	eb = btrfs_find_tree_block(root, search, blocksize);
1292 1293
	if (eb) {
		free_extent_buffer(eb);
1294 1295 1296
		return;
	}

1297
	target = search;
1298

1299
	nritems = btrfs_header_nritems(node);
1300
	nr = slot;
C
Chris Mason 已提交
1301
	while (1) {
1302 1303 1304 1305 1306 1307 1308 1309
		if (direction < 0) {
			if (nr == 0)
				break;
			nr--;
		} else if (direction > 0) {
			nr++;
			if (nr >= nritems)
				break;
1310
		}
1311 1312 1313 1314 1315
		if (path->reada < 0 && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
1316
		search = btrfs_node_blockptr(node, nr);
1317 1318
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
1319 1320
			readahead_tree_block(root, search, blocksize,
				     btrfs_node_ptr_generation(node, nr));
1321 1322 1323
			nread += blocksize;
		}
		nscan++;
1324
		if ((nread > 65536 || nscan > 32))
1325
			break;
1326 1327
	}
}
1328

1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390
/*
 * returns -EAGAIN if it had to drop the path, or zero if everything was in
 * cache
 */
static noinline int reada_for_balance(struct btrfs_root *root,
				      struct btrfs_path *path, int level)
{
	int slot;
	int nritems;
	struct extent_buffer *parent;
	struct extent_buffer *eb;
	u64 gen;
	u64 block1 = 0;
	u64 block2 = 0;
	int ret = 0;
	int blocksize;

	parent = path->nodes[level - 1];
	if (!parent)
		return 0;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level];
	blocksize = btrfs_level_size(root, level);

	if (slot > 0) {
		block1 = btrfs_node_blockptr(parent, slot - 1);
		gen = btrfs_node_ptr_generation(parent, slot - 1);
		eb = btrfs_find_tree_block(root, block1, blocksize);
		if (eb && btrfs_buffer_uptodate(eb, gen))
			block1 = 0;
		free_extent_buffer(eb);
	}
	if (slot < nritems) {
		block2 = btrfs_node_blockptr(parent, slot + 1);
		gen = btrfs_node_ptr_generation(parent, slot + 1);
		eb = btrfs_find_tree_block(root, block2, blocksize);
		if (eb && btrfs_buffer_uptodate(eb, gen))
			block2 = 0;
		free_extent_buffer(eb);
	}
	if (block1 || block2) {
		ret = -EAGAIN;
		btrfs_release_path(root, path);
		if (block1)
			readahead_tree_block(root, block1, blocksize, 0);
		if (block2)
			readahead_tree_block(root, block2, blocksize, 0);

		if (block1) {
			eb = read_tree_block(root, block1, blocksize, 0);
			free_extent_buffer(eb);
		}
		if (block1) {
			eb = read_tree_block(root, block2, blocksize, 0);
			free_extent_buffer(eb);
		}
	}
	return ret;
}


C
Chris Mason 已提交
1391
/*
C
Chris Mason 已提交
1392 1393 1394 1395
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
C
Chris Mason 已提交
1396
 *
C
Chris Mason 已提交
1397 1398 1399
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
C
Chris Mason 已提交
1400
 *
C
Chris Mason 已提交
1401 1402
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
C
Chris Mason 已提交
1403
 */
1404 1405
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
1406 1407 1408
{
	int i;
	int skip_level = level;
1409
	int no_skips = 0;
1410 1411 1412 1413 1414 1415 1416
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
1417
		if (!no_skips && path->slots[i] == 0) {
1418 1419 1420
			skip_level = i + 1;
			continue;
		}
1421
		if (!no_skips && path->keep_locks) {
1422 1423 1424
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
1425
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
1426 1427 1428 1429
				skip_level = i + 1;
				continue;
			}
		}
1430 1431 1432
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

1433 1434 1435 1436 1437 1438 1439 1440
		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock(t);
			path->locks[i] = 0;
		}
	}
}

1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458
/*
 * This releases any locks held in the path starting at level and
 * going all the way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few
 * corner cases, such as COW of the block at slot zero in the node.  This
 * ignores those rules, and it should only be called when there are no
 * more updates to be done higher up in the tree.
 */
noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
	int i;

	if (path->keep_locks || path->lowest_level)
		return;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
1459
			continue;
1460
		if (!path->locks[i])
1461
			continue;
1462 1463 1464 1465 1466
		btrfs_tree_unlock(path->nodes[i]);
		path->locks[i] = 0;
	}
}

C
Chris Mason 已提交
1467 1468 1469 1470 1471 1472
/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
C
Chris Mason 已提交
1473 1474
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
C
Chris Mason 已提交
1475 1476 1477 1478
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
C
Chris Mason 已提交
1479
 */
1480 1481 1482
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
1483
{
1484
	struct extent_buffer *b;
1485
	struct extent_buffer *tmp;
1486 1487 1488
	int slot;
	int ret;
	int level;
1489
	int should_reada = p->reada;
1490
	int lowest_unlock = 1;
1491
	int blocksize;
1492
	u8 lowest_level = 0;
1493 1494
	u64 blocknr;
	u64 gen;
1495
	struct btrfs_key prealloc_block;
1496

1497
	lowest_level = p->lowest_level;
1498
	WARN_ON(lowest_level && ins_len > 0);
C
Chris Mason 已提交
1499
	WARN_ON(p->nodes[0] != NULL);
1500

1501 1502
	if (ins_len < 0)
		lowest_unlock = 2;
1503 1504 1505

	prealloc_block.objectid = 0;

1506
again:
1507 1508 1509 1510
	if (p->skip_locking)
		b = btrfs_root_node(root);
	else
		b = btrfs_lock_root_node(root);
1511

1512
	while (b) {
1513
		level = btrfs_header_level(b);
1514 1515 1516 1517 1518 1519 1520 1521 1522

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

C
Chris Mason 已提交
1523 1524
		if (cow) {
			int wret;
1525 1526 1527

			/* is a cow on this block not required */
			if (btrfs_header_generation(b) == trans->transid &&
1528
			    btrfs_header_owner(b) == root->root_key.objectid &&
1529 1530 1531 1532 1533 1534 1535 1536 1537
			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
				goto cow_done;
			}

			/* ok, we have to cow, is our old prealloc the right
			 * size?
			 */
			if (prealloc_block.objectid &&
			    prealloc_block.offset != b->len) {
1538
				btrfs_release_path(root, p);
1539 1540 1541 1542
				btrfs_free_reserved_extent(root,
					   prealloc_block.objectid,
					   prealloc_block.offset);
				prealloc_block.objectid = 0;
1543
				goto again;
1544 1545 1546 1547 1548 1549
			}

			/*
			 * for higher level blocks, try not to allocate blocks
			 * with the block and the parent locks held.
			 */
C
Chris Mason 已提交
1550
			if (level > 0 && !prealloc_block.objectid) {
1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562
				u32 size = b->len;
				u64 hint = b->start;

				btrfs_release_path(root, p);
				ret = btrfs_reserve_extent(trans, root,
							   size, size, 0,
							   hint, (u64)-1,
							   &prealloc_block, 0);
				BUG_ON(ret);
				goto again;
			}

1563 1564
			btrfs_set_path_blocking(p);

C
Chris Mason 已提交
1565 1566 1567
			wret = btrfs_cow_block(trans, root, b,
					       p->nodes[level + 1],
					       p->slots[level + 1],
1568 1569
					       &b, prealloc_block.objectid);
			prealloc_block.objectid = 0;
1570
			if (wret) {
1571
				free_extent_buffer(b);
1572 1573
				ret = wret;
				goto done;
1574
			}
C
Chris Mason 已提交
1575
		}
1576
cow_done:
C
Chris Mason 已提交
1577
		BUG_ON(!cow && ins_len);
1578
		if (level != btrfs_header_level(b))
C
Chris Mason 已提交
1579
			WARN_ON(1);
1580
		level = btrfs_header_level(b);
1581

1582
		p->nodes[level] = b;
1583 1584
		if (!p->skip_locking)
			p->locks[level] = 1;
1585

1586
		btrfs_clear_path_blocking(p, NULL);
1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If cow is true, then we might be changing slot zero,
		 * which may require changing the parent.  So, we can't
		 * drop the lock until after we know which slot we're
		 * operating on.
		 */
		if (!cow)
			btrfs_unlock_up_safe(p, level + 1);

C
Chris Mason 已提交
1602
		ret = check_block(root, p, level);
1603 1604 1605 1606
		if (ret) {
			ret = -1;
			goto done;
		}
1607

1608
		ret = bin_search(b, key, level, &slot);
1609

1610
		if (level != 0) {
1611 1612 1613
			if (ret && slot > 0)
				slot -= 1;
			p->slots[level] = slot;
1614 1615
			if ((p->search_for_split || ins_len > 0) &&
			    btrfs_header_nritems(b) >=
1616
			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1617 1618 1619 1620 1621 1622 1623 1624
				int sret;

				sret = reada_for_balance(root, p, level);
				if (sret)
					goto again;

				btrfs_set_path_blocking(p);
				sret = split_node(trans, root, p, level);
1625
				btrfs_clear_path_blocking(p, NULL);
1626

C
Chris Mason 已提交
1627
				BUG_ON(sret > 0);
1628 1629 1630 1631
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1632 1633
				b = p->nodes[level];
				slot = p->slots[level];
1634 1635 1636
			} else if (ins_len < 0 &&
				   btrfs_header_nritems(b) <
				   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1637 1638 1639 1640 1641 1642 1643 1644
				int sret;

				sret = reada_for_balance(root, p, level);
				if (sret)
					goto again;

				btrfs_set_path_blocking(p);
				sret = balance_level(trans, root, p, level);
1645
				btrfs_clear_path_blocking(p, NULL);
1646

1647 1648 1649 1650
				if (sret) {
					ret = sret;
					goto done;
				}
1651
				b = p->nodes[level];
1652 1653
				if (!b) {
					btrfs_release_path(NULL, p);
1654
					goto again;
1655
				}
1656
				slot = p->slots[level];
1657
				BUG_ON(btrfs_header_nritems(b) == 1);
C
Chris Mason 已提交
1658
			}
1659 1660
			unlock_up(p, level, lowest_unlock);

1661
			/* this is only true while dropping a snapshot */
1662
			if (level == lowest_level) {
1663 1664
				ret = 0;
				goto done;
1665
			}
1666

1667 1668 1669 1670 1671 1672
			blocknr = btrfs_node_blockptr(b, slot);
			gen = btrfs_node_ptr_generation(b, slot);
			blocksize = btrfs_level_size(root, level - 1);

			tmp = btrfs_find_tree_block(root, blocknr, blocksize);
			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1673 1674 1675 1676 1677 1678 1679
				b = tmp;
			} else {
				/*
				 * reduce lock contention at high levels
				 * of the btree by dropping locks before
				 * we read.
				 */
1680
				if (level > 0) {
1681
					btrfs_release_path(NULL, p);
1682 1683
					if (tmp)
						free_extent_buffer(tmp);
1684 1685 1686 1687 1688
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);

1689 1690
					tmp = read_tree_block(root, blocknr,
							 blocksize, gen);
1691 1692 1693 1694
					if (tmp)
						free_extent_buffer(tmp);
					goto again;
				} else {
1695
					btrfs_set_path_blocking(p);
1696 1697
					if (tmp)
						free_extent_buffer(tmp);
1698 1699 1700 1701
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);
1702 1703 1704
					b = read_node_slot(root, b, slot);
				}
			}
1705 1706 1707
			if (!p->skip_locking) {
				int lret;

1708
				btrfs_clear_path_blocking(p, NULL);
1709 1710 1711 1712 1713
				lret = btrfs_try_spin_lock(b);

				if (!lret) {
					btrfs_set_path_blocking(p);
					btrfs_tree_lock(b);
1714
					btrfs_clear_path_blocking(p, b);
1715 1716
				}
			}
1717 1718
		} else {
			p->slots[level] = slot;
1719 1720
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
1721 1722 1723 1724
				int sret;

				btrfs_set_path_blocking(p);
				sret = split_leaf(trans, root, key,
1725
						      p, ins_len, ret == 0);
1726
				btrfs_clear_path_blocking(p, NULL);
1727

C
Chris Mason 已提交
1728
				BUG_ON(sret > 0);
1729 1730 1731 1732
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1733
			}
1734 1735
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock);
1736
			goto done;
1737 1738
		}
	}
1739 1740
	ret = 1;
done:
1741 1742 1743 1744 1745
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	btrfs_set_path_blocking(p);
1746 1747 1748 1749 1750 1751
	if (prealloc_block.objectid) {
		btrfs_free_reserved_extent(root,
			   prealloc_block.objectid,
			   prealloc_block.offset);
	}
	return ret;
1752 1753
}

Z
Zheng Yan 已提交
1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773
int btrfs_merge_path(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_key *node_keys,
		     u64 *nodes, int lowest_level)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_key key;
	u64 bytenr;
	u64 generation;
	u32 blocksize;
	int level;
	int slot;
	int key_match;
	int ret;

	eb = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
	BUG_ON(ret);

1774 1775
	btrfs_set_lock_blocking(eb);

Z
Zheng Yan 已提交
1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795
	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		if (level == 0 || level <= lowest_level)
			break;

		ret = bin_search(parent, &node_keys[lowest_level], level,
				 &slot);
		if (ret && slot > 0)
			slot--;

		bytenr = btrfs_node_blockptr(parent, slot);
		if (nodes[level - 1] == bytenr)
			break;

		blocksize = btrfs_level_size(root, level - 1);
		generation = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(eb, &key, slot);
		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));

Y
Yan Zheng 已提交
1796 1797 1798 1799
		if (generation == trans->transid) {
			eb = read_tree_block(root, bytenr, blocksize,
					     generation);
			btrfs_tree_lock(eb);
1800
			btrfs_set_lock_blocking(eb);
Y
Yan Zheng 已提交
1801 1802
		}

Z
Zheng Yan 已提交
1803 1804 1805
		/*
		 * if node keys match and node pointer hasn't been modified
		 * in the running transaction, we can merge the path. for
Y
Yan Zheng 已提交
1806 1807 1808
		 * blocks owened by reloc trees, the node pointer check is
		 * skipped, this is because these blocks are fully controlled
		 * by the space balance code, no one else can modify them.
Z
Zheng Yan 已提交
1809 1810 1811
		 */
		if (!nodes[level - 1] || !key_match ||
		    (generation == trans->transid &&
Y
Yan Zheng 已提交
1812 1813 1814 1815 1816 1817
		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
			if (level == 1 || level == lowest_level + 1) {
				if (generation == trans->transid) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
				}
Z
Zheng Yan 已提交
1818
				break;
Y
Yan Zheng 已提交
1819
			}
Z
Zheng Yan 已提交
1820

Y
Yan Zheng 已提交
1821 1822 1823 1824
			if (generation != trans->transid) {
				eb = read_tree_block(root, bytenr, blocksize,
						generation);
				btrfs_tree_lock(eb);
1825
				btrfs_set_lock_blocking(eb);
Y
Yan Zheng 已提交
1826
			}
Z
Zheng Yan 已提交
1827 1828 1829 1830 1831

			ret = btrfs_cow_block(trans, root, eb, parent, slot,
					      &eb, 0);
			BUG_ON(ret);

Y
Yan Zheng 已提交
1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				if (!nodes[level - 1]) {
					nodes[level - 1] = eb->start;
					memcpy(&node_keys[level - 1], &key,
					       sizeof(node_keys[0]));
				} else {
					WARN_ON(1);
				}
			}

Z
Zheng Yan 已提交
1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			parent = eb;
			continue;
		}

		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
		btrfs_mark_buffer_dirty(parent);

		ret = btrfs_inc_extent_ref(trans, root,
					nodes[level - 1],
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
1858
					level - 1);
Z
Zheng Yan 已提交
1859 1860
		BUG_ON(ret);

Y
Yan Zheng 已提交
1861 1862 1863 1864 1865
		/*
		 * If the block was created in the running transaction,
		 * it's possible this is the last reference to it, so we
		 * should drop the subtree.
		 */
Z
Zheng Yan 已提交
1866
		if (generation == trans->transid) {
Y
Yan Zheng 已提交
1867 1868
			ret = btrfs_drop_subtree(trans, root, eb, parent);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1869 1870
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
Y
Yan Zheng 已提交
1871 1872 1873 1874 1875 1876 1877
		} else {
			ret = btrfs_free_extent(trans, root, bytenr,
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
					level - 1, 1);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1878 1879 1880 1881 1882 1883 1884 1885
		}
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return 0;
}

C
Chris Mason 已提交
1886 1887 1888 1889 1890 1891
/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node is points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
C
Chris Mason 已提交
1892 1893 1894
 *
 * If this fails to write a tree block, it returns -1, but continues
 * fixing up the blocks in ram so the tree is consistent.
C
Chris Mason 已提交
1895
 */
1896 1897 1898
static int fixup_low_keys(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_path *path,
			  struct btrfs_disk_key *key, int level)
1899 1900
{
	int i;
C
Chris Mason 已提交
1901
	int ret = 0;
1902 1903
	struct extent_buffer *t;

C
Chris Mason 已提交
1904
	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1905
		int tslot = path->slots[i];
1906
		if (!path->nodes[i])
1907
			break;
1908 1909
		t = path->nodes[i];
		btrfs_set_node_key(t, key, tslot);
C
Chris Mason 已提交
1910
		btrfs_mark_buffer_dirty(path->nodes[i]);
1911 1912 1913
		if (tslot != 0)
			break;
	}
C
Chris Mason 已提交
1914
	return ret;
1915 1916
}

Z
Zheng Yan 已提交
1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (comp_keys(&disk_key, new_key) >= 0)
			return -1;
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (comp_keys(&disk_key, new_key) <= 0)
			return -1;
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
		fixup_low_keys(trans, root, path, &disk_key, 1);
	return 0;
}

C
Chris Mason 已提交
1952 1953
/*
 * try to push data from one node into the next node left in the
1954
 * tree.
C
Chris Mason 已提交
1955 1956 1957
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
C
Chris Mason 已提交
1958
 */
1959 1960
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
1961
			  struct extent_buffer *src, int empty)
1962 1963
{
	int push_items = 0;
1964 1965
	int src_nritems;
	int dst_nritems;
C
Chris Mason 已提交
1966
	int ret = 0;
1967

1968 1969
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1970
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1971 1972
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);
1973

1974
	if (!empty && src_nritems <= 8)
1975 1976
		return 1;

C
Chris Mason 已提交
1977
	if (push_items <= 0)
1978 1979
		return 1;

1980
	if (empty) {
1981
		push_items = min(src_nritems, push_items);
1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);
1994

1995 1996 1997
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
C
Chris Mason 已提交
1998
			   push_items * sizeof(struct btrfs_key_ptr));
1999

2000
	if (push_items < src_nritems) {
2001 2002 2003 2004 2005 2006 2007 2008 2009
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
2010 2011 2012 2013

	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
	BUG_ON(ret);

2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
	return ret;
}

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
2026 2027 2028 2029
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
2030 2031 2032 2033 2034 2035 2036
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

2037 2038 2039
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

2040 2041
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
2042
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
C
Chris Mason 已提交
2043
	if (push_items <= 0)
2044
		return 1;
2045

C
Chris Mason 已提交
2046
	if (src_nritems < 4)
2047
		return 1;
2048 2049 2050

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
C
Chris Mason 已提交
2051
	if (max_push >= src_nritems)
2052
		return 1;
Y
Yan 已提交
2053

2054 2055 2056
	if (max_push < push_items)
		push_items = max_push;

2057 2058 2059 2060
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
2061

2062 2063 2064
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
C
Chris Mason 已提交
2065
			   push_items * sizeof(struct btrfs_key_ptr));
2066

2067 2068
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
2069

2070 2071
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
2072 2073 2074 2075

	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
	BUG_ON(ret);

C
Chris Mason 已提交
2076
	return ret;
2077 2078
}

C
Chris Mason 已提交
2079 2080 2081 2082
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
C
Chris Mason 已提交
2083 2084
 *
 * returns zero on success or < 0 on failure.
C
Chris Mason 已提交
2085
 */
C
Chris Mason 已提交
2086
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2087 2088
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
C
Chris Mason 已提交
2089
{
2090
	u64 lower_gen;
2091 2092
	struct extent_buffer *lower;
	struct extent_buffer *c;
2093
	struct extent_buffer *old;
2094
	struct btrfs_disk_key lower_key;
Z
Zheng Yan 已提交
2095
	int ret;
C
Chris Mason 已提交
2096 2097 2098 2099

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

2100 2101 2102 2103 2104 2105
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

Z
Zheng Yan 已提交
2106 2107
	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, trans->transid,
2108
				   level, root->node->start, 0);
2109 2110
	if (IS_ERR(c))
		return PTR_ERR(c);
2111

2112 2113 2114
	memset_extent_buffer(c, 0, 0, root->nodesize);
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
2115
	btrfs_set_header_bytenr(c, c->start);
2116 2117 2118 2119 2120 2121
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);
2122 2123 2124 2125 2126

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

2127
	btrfs_set_node_key(c, &lower_key, 0);
2128
	btrfs_set_node_blockptr(c, 0, lower->start);
2129
	lower_gen = btrfs_header_generation(lower);
Z
Zheng Yan 已提交
2130
	WARN_ON(lower_gen != trans->transid);
2131 2132

	btrfs_set_node_ptr_generation(c, 0, lower_gen);
2133

2134
	btrfs_mark_buffer_dirty(c);
2135

2136 2137
	spin_lock(&root->node_lock);
	old = root->node;
2138
	root->node = c;
2139 2140
	spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
2141 2142 2143
	ret = btrfs_update_extent_ref(trans, root, lower->start,
				      lower->start, c->start,
				      root->root_key.objectid,
2144
				      trans->transid, level - 1);
Z
Zheng Yan 已提交
2145 2146
	BUG_ON(ret);

2147 2148 2149
	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

2150
	add_root_to_dirty_list(root);
2151 2152
	extent_buffer_get(c);
	path->nodes[level] = c;
2153
	path->locks[level] = 1;
C
Chris Mason 已提交
2154 2155 2156 2157
	path->slots[level] = 0;
	return 0;
}

C
Chris Mason 已提交
2158 2159 2160
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
C
Chris Mason 已提交
2161
 *
C
Chris Mason 已提交
2162 2163
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
C
Chris Mason 已提交
2164 2165
 *
 * returns zero on success and < 0 on any error
C
Chris Mason 已提交
2166
 */
2167 2168
static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, struct btrfs_disk_key
2169
		      *key, u64 bytenr, int slot, int level)
C
Chris Mason 已提交
2170
{
2171
	struct extent_buffer *lower;
C
Chris Mason 已提交
2172
	int nritems;
C
Chris Mason 已提交
2173 2174

	BUG_ON(!path->nodes[level]);
2175 2176
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
C
Chris Mason 已提交
2177 2178
	if (slot > nritems)
		BUG();
C
Chris Mason 已提交
2179
	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
C
Chris Mason 已提交
2180 2181
		BUG();
	if (slot != nritems) {
2182 2183 2184
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
C
Chris Mason 已提交
2185
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
2186
	}
2187
	btrfs_set_node_key(lower, key, slot);
2188
	btrfs_set_node_blockptr(lower, slot, bytenr);
2189 2190
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2191 2192
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
C
Chris Mason 已提交
2193 2194 2195
	return 0;
}

C
Chris Mason 已提交
2196 2197 2198 2199 2200 2201
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
C
Chris Mason 已提交
2202 2203
 *
 * returns 0 on success and < 0 on failure
C
Chris Mason 已提交
2204
 */
2205 2206 2207
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
2208
{
2209 2210 2211
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
2212
	int mid;
C
Chris Mason 已提交
2213
	int ret;
C
Chris Mason 已提交
2214
	int wret;
2215
	u32 c_nritems;
2216

2217
	c = path->nodes[level];
2218
	WARN_ON(btrfs_header_generation(c) != trans->transid);
2219
	if (c == root->node) {
C
Chris Mason 已提交
2220
		/* trying to split the root, lets make a new one */
2221
		ret = insert_new_root(trans, root, path, level + 1);
C
Chris Mason 已提交
2222 2223
		if (ret)
			return ret;
2224 2225
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
2226 2227
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
2228
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2229
			return 0;
2230 2231
		if (ret < 0)
			return ret;
2232
	}
2233

2234
	c_nritems = btrfs_header_nritems(c);
2235

2236
	split = btrfs_alloc_free_block(trans, root, root->nodesize,
Z
Zheng Yan 已提交
2237 2238 2239
					path->nodes[level + 1]->start,
					root->root_key.objectid,
					trans->transid, level, c->start, 0);
2240 2241 2242 2243 2244
	if (IS_ERR(split))
		return PTR_ERR(split);

	btrfs_set_header_flags(split, btrfs_header_flags(c));
	btrfs_set_header_level(split, btrfs_header_level(c));
2245
	btrfs_set_header_bytenr(split, split->start);
2246 2247
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_owner(split, root->root_key.objectid);
2248
	btrfs_set_header_flags(split, 0);
2249 2250 2251
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
2252 2253 2254
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);
2255

2256
	mid = (c_nritems + 1) / 2;
2257 2258 2259 2260 2261 2262 2263

	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
C
Chris Mason 已提交
2264 2265
	ret = 0;

2266 2267 2268 2269
	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	btrfs_node_key(split, &disk_key, 0);
2270
	wret = insert_ptr(trans, root, path, &disk_key, split->start,
2271
			  path->slots[level + 1] + 1,
C
Chris Mason 已提交
2272
			  level + 1);
C
Chris Mason 已提交
2273 2274 2275
	if (wret)
		ret = wret;

Z
Zheng Yan 已提交
2276 2277 2278
	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
	BUG_ON(ret);

C
Chris Mason 已提交
2279
	if (path->slots[level] >= mid) {
C
Chris Mason 已提交
2280
		path->slots[level] -= mid;
2281
		btrfs_tree_unlock(c);
2282 2283
		free_extent_buffer(c);
		path->nodes[level] = split;
C
Chris Mason 已提交
2284 2285
		path->slots[level + 1] += 1;
	} else {
2286
		btrfs_tree_unlock(split);
2287
		free_extent_buffer(split);
2288
	}
C
Chris Mason 已提交
2289
	return ret;
2290 2291
}

C
Chris Mason 已提交
2292 2293 2294 2295 2296
/*
 * how many bytes are required to store the items in a leaf.  start
 * and nr indicate which items in the leaf to check.  This totals up the
 * space used both by the item structs and the item data
 */
2297
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2298 2299
{
	int data_len;
2300
	int nritems = btrfs_header_nritems(l);
2301
	int end = min(nritems, start + nr) - 1;
2302 2303 2304

	if (!nr)
		return 0;
2305 2306
	data_len = btrfs_item_end_nr(l, start);
	data_len = data_len - btrfs_item_offset_nr(l, end);
C
Chris Mason 已提交
2307
	data_len += sizeof(struct btrfs_item) * nr;
2308
	WARN_ON(data_len < 0);
2309 2310 2311
	return data_len;
}

2312 2313 2314 2315 2316
/*
 * The space between the end of the leaf items and
 * the start of the leaf data.  IOW, how much room
 * the leaf has left for both items and data
 */
C
Chris Mason 已提交
2317
noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2318
				   struct extent_buffer *leaf)
2319
{
2320 2321 2322 2323
	int nritems = btrfs_header_nritems(leaf);
	int ret;
	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
C
Chris Mason 已提交
2324 2325
		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
		       "used %d nritems %d\n",
J
Jens Axboe 已提交
2326
		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2327 2328 2329
		       leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
2330 2331
}

C
Chris Mason 已提交
2332 2333 2334
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
C
Chris Mason 已提交
2335 2336 2337
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
C
Chris Mason 已提交
2338
 */
2339
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2340 2341
			   *root, struct btrfs_path *path, int data_size,
			   int empty)
C
Chris Mason 已提交
2342
{
2343 2344 2345 2346
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2347
	int slot;
2348
	u32 i;
C
Chris Mason 已提交
2349 2350 2351
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2352
	struct btrfs_item *item;
2353
	u32 left_nritems;
2354
	u32 nr;
2355
	u32 right_nritems;
2356
	u32 data_end;
2357
	u32 this_item_size;
2358
	int ret;
C
Chris Mason 已提交
2359 2360

	slot = path->slots[1];
C
Chris Mason 已提交
2361
	if (!path->nodes[1])
C
Chris Mason 已提交
2362
		return 1;
C
Chris Mason 已提交
2363

C
Chris Mason 已提交
2364
	upper = path->nodes[1];
2365
	if (slot >= btrfs_header_nritems(upper) - 1)
C
Chris Mason 已提交
2366
		return 1;
2367

2368
	btrfs_assert_tree_locked(path->nodes[1]);
2369

2370
	right = read_node_slot(root, upper, slot + 1);
2371
	btrfs_tree_lock(right);
2372 2373
	btrfs_set_lock_blocking(right);

C
Chris Mason 已提交
2374
	free_space = btrfs_leaf_free_space(root, right);
2375
	if (free_space < data_size)
2376
		goto out_unlock;
2377

C
Chris Mason 已提交
2378
	/* cow and double check */
2379
	ret = btrfs_cow_block(trans, root, right, upper,
2380
			      slot + 1, &right, 0);
2381 2382 2383
	if (ret)
		goto out_unlock;

C
Chris Mason 已提交
2384
	free_space = btrfs_leaf_free_space(root, right);
2385
	if (free_space < data_size)
2386
		goto out_unlock;
C
Chris Mason 已提交
2387

2388
	left_nritems = btrfs_header_nritems(left);
2389 2390
	if (left_nritems == 0)
		goto out_unlock;
2391

2392 2393 2394 2395 2396
	if (empty)
		nr = 0;
	else
		nr = 1;

Z
Zheng Yan 已提交
2397
	if (path->slots[0] >= left_nritems)
2398
		push_space += data_size;
Z
Zheng Yan 已提交
2399

2400 2401
	i = left_nritems - 1;
	while (i >= nr) {
2402
		item = btrfs_item_nr(left, i);
2403

Z
Zheng Yan 已提交
2404 2405 2406 2407 2408 2409 2410 2411 2412 2413
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

C
Chris Mason 已提交
2414
		if (path->slots[0] == i)
2415
			push_space += data_size;
2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426

		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
C
Chris Mason 已提交
2427
			break;
Z
Zheng Yan 已提交
2428

C
Chris Mason 已提交
2429
		push_items++;
2430
		push_space += this_item_size + sizeof(*item);
2431 2432 2433
		if (i == 0)
			break;
		i--;
2434 2435 2436 2437
	}
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
C
Chris Mason 已提交
2438
	}
2439

2440 2441
	if (push_items == 0)
		goto out_unlock;
2442

2443
	if (!empty && push_items == left_nritems)
2444
		WARN_ON(1);
2445

C
Chris Mason 已提交
2446
	/* push left to right */
2447
	right_nritems = btrfs_header_nritems(right);
2448

2449
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
C
Chris Mason 已提交
2450
	push_space -= leaf_data_end(root, left);
2451

C
Chris Mason 已提交
2452
	/* make room in the right data area */
2453 2454 2455 2456 2457 2458
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

C
Chris Mason 已提交
2459
	/* copy from the left data area */
2460
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
C
Chris Mason 已提交
2461 2462 2463
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);
2464 2465 2466 2467 2468

	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2469
	/* copy the items from left to right */
2470 2471 2472
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));
C
Chris Mason 已提交
2473 2474

	/* update the item pointers */
2475
	right_nritems += push_items;
2476
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2477
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2478
	for (i = 0; i < right_nritems; i++) {
2479
		item = btrfs_item_nr(right, i);
2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}
		push_space -= btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
C
Chris Mason 已提交
2494
	}
2495
	left_nritems -= push_items;
2496
	btrfs_set_header_nritems(left, left_nritems);
C
Chris Mason 已提交
2497

2498 2499
	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
2500
	btrfs_mark_buffer_dirty(right);
2501

Z
Zheng Yan 已提交
2502 2503 2504
	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
	BUG_ON(ret);

2505 2506
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
C
Chris Mason 已提交
2507
	btrfs_mark_buffer_dirty(upper);
C
Chris Mason 已提交
2508

C
Chris Mason 已提交
2509
	/* then fixup the leaf pointer in the path */
2510 2511
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
2512 2513 2514
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2515 2516
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
C
Chris Mason 已提交
2517 2518
		path->slots[1] += 1;
	} else {
2519
		btrfs_tree_unlock(right);
2520
		free_extent_buffer(right);
C
Chris Mason 已提交
2521 2522
	}
	return 0;
2523 2524 2525 2526 2527

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
C
Chris Mason 已提交
2528
}
2529

C
Chris Mason 已提交
2530 2531 2532 2533
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 */
2534
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2535 2536
			  *root, struct btrfs_path *path, int data_size,
			  int empty)
2537
{
2538 2539 2540
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
2541 2542 2543 2544 2545
	int slot;
	int i;
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2546
	struct btrfs_item *item;
2547
	u32 old_left_nritems;
2548
	u32 right_nritems;
2549
	u32 nr;
C
Chris Mason 已提交
2550 2551
	int ret = 0;
	int wret;
2552 2553
	u32 this_item_size;
	u32 old_left_item_size;
2554 2555

	slot = path->slots[1];
2556
	if (slot == 0)
2557
		return 1;
2558
	if (!path->nodes[1])
2559
		return 1;
2560

2561
	right_nritems = btrfs_header_nritems(right);
C
Chris Mason 已提交
2562
	if (right_nritems == 0)
2563 2564
		return 1;

2565
	btrfs_assert_tree_locked(path->nodes[1]);
2566

2567
	left = read_node_slot(root, path->nodes[1], slot - 1);
2568
	btrfs_tree_lock(left);
2569 2570
	btrfs_set_lock_blocking(left);

C
Chris Mason 已提交
2571
	free_space = btrfs_leaf_free_space(root, left);
2572
	if (free_space < data_size) {
2573 2574
		ret = 1;
		goto out;
2575
	}
C
Chris Mason 已提交
2576 2577

	/* cow and double check */
2578
	ret = btrfs_cow_block(trans, root, left,
2579
			      path->nodes[1], slot - 1, &left, 0);
2580 2581
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
2582 2583
		ret = 1;
		goto out;
2584
	}
2585

C
Chris Mason 已提交
2586
	free_space = btrfs_leaf_free_space(root, left);
2587
	if (free_space < data_size) {
2588 2589
		ret = 1;
		goto out;
C
Chris Mason 已提交
2590 2591
	}

2592 2593 2594 2595 2596 2597
	if (empty)
		nr = right_nritems;
	else
		nr = right_nritems - 1;

	for (i = 0; i < nr; i++) {
2598
		item = btrfs_item_nr(right, i);
2599 2600 2601 2602 2603 2604 2605 2606
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

Z
Zheng Yan 已提交
2607 2608 2609 2610 2611 2612 2613 2614 2615 2616
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

2617
		if (path->slots[0] == i)
2618
			push_space += data_size;
2619 2620 2621

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
2622
			break;
2623

2624
		push_items++;
2625 2626 2627 2628 2629 2630
		push_space += this_item_size + sizeof(*item);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2631
	}
2632

2633
	if (push_items == 0) {
2634 2635
		ret = 1;
		goto out;
2636
	}
2637
	if (!empty && push_items == btrfs_header_nritems(right))
2638
		WARN_ON(1);
2639

2640
	/* push data from right to left */
2641 2642 2643 2644 2645
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2646
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
C
Chris Mason 已提交
2647
		     btrfs_item_offset_nr(right, push_items - 1);
2648 2649

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
C
Chris Mason 已提交
2650 2651
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
2652
		     btrfs_item_offset_nr(right, push_items - 1),
C
Chris Mason 已提交
2653
		     push_space);
2654
	old_left_nritems = btrfs_header_nritems(left);
2655
	BUG_ON(old_left_nritems <= 0);
2656

2657
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
C
Chris Mason 已提交
2658
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2659
		u32 ioff;
2660

2661
		item = btrfs_item_nr(left, i);
2662 2663 2664 2665 2666 2667 2668 2669
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

2670 2671
		ioff = btrfs_item_offset(left, item);
		btrfs_set_item_offset(left, item,
2672
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2673
	}
2674
	btrfs_set_header_nritems(left, old_left_nritems + push_items);
2675 2676 2677 2678
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}
2679 2680

	/* fixup right node */
2681
	if (push_items > right_nritems) {
C
Chris Mason 已提交
2682 2683
		printk(KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);
2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2696 2697 2698
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
2699
	}
2700 2701
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2702
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2703 2704
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		push_space = push_space - btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2720
	}
2721

2722
	btrfs_mark_buffer_dirty(left);
2723 2724
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
2725

Z
Zheng Yan 已提交
2726 2727 2728 2729
	ret = btrfs_update_ref(trans, root, right, left,
			       old_left_nritems, push_items);
	BUG_ON(ret);

2730 2731
	btrfs_item_key(right, &disk_key, 0);
	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
C
Chris Mason 已提交
2732 2733
	if (wret)
		ret = wret;
2734 2735 2736 2737

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
2738 2739 2740
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2741 2742
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
2743 2744
		path->slots[1] -= 1;
	} else {
2745
		btrfs_tree_unlock(left);
2746
		free_extent_buffer(left);
2747 2748
		path->slots[0] -= push_items;
	}
2749
	BUG_ON(path->slots[0] < 0);
C
Chris Mason 已提交
2750
	return ret;
2751 2752 2753 2754
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
2755 2756
}

C
Chris Mason 已提交
2757 2758 2759
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
C
Chris Mason 已提交
2760 2761
 *
 * returns 0 if all went well and < 0 on failure.
C
Chris Mason 已提交
2762
 */
2763 2764 2765 2766 2767
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
2768
{
2769
	struct extent_buffer *l;
2770
	u32 nritems;
2771 2772
	int mid;
	int slot;
2773
	struct extent_buffer *right;
2774 2775 2776
	int data_copy_size;
	int rt_data_off;
	int i;
2777
	int ret = 0;
C
Chris Mason 已提交
2778
	int wret;
2779 2780
	int double_split;
	int num_doubles = 0;
2781
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2782

C
Chris Mason 已提交
2783
	/* first try to make some room by pushing left and right */
2784
	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2785
		wret = push_leaf_right(trans, root, path, data_size, 0);
C
Chris Mason 已提交
2786
		if (wret < 0)
C
Chris Mason 已提交
2787
			return wret;
2788
		if (wret) {
2789
			wret = push_leaf_left(trans, root, path, data_size, 0);
2790 2791 2792 2793
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];
C
Chris Mason 已提交
2794

2795
		/* did the pushes work? */
2796
		if (btrfs_leaf_free_space(root, l) >= data_size)
2797
			return 0;
2798
	}
C
Chris Mason 已提交
2799

C
Chris Mason 已提交
2800
	if (!path->nodes[1]) {
2801
		ret = insert_new_root(trans, root, path, 1);
C
Chris Mason 已提交
2802 2803 2804
		if (ret)
			return ret;
	}
2805 2806 2807
again:
	double_split = 0;
	l = path->nodes[0];
2808
	slot = path->slots[0];
2809
	nritems = btrfs_header_nritems(l);
C
Chris Mason 已提交
2810
	mid = (nritems + 1) / 2;
2811

2812
	right = btrfs_alloc_free_block(trans, root, root->leafsize,
Z
Zheng Yan 已提交
2813 2814 2815
					path->nodes[1]->start,
					root->root_key.objectid,
					trans->transid, 0, l->start, 0);
2816 2817
	if (IS_ERR(right)) {
		BUG_ON(1);
2818
		return PTR_ERR(right);
2819
	}
2820 2821

	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2822
	btrfs_set_header_bytenr(right, right->start);
2823 2824 2825 2826 2827 2828
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);
2829 2830 2831 2832

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);
2833 2834
	if (mid <= slot) {
		if (nritems == 1 ||
2835
		    leaf_space_used(l, mid, nritems - mid) + data_size >
2836 2837 2838
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2839
				btrfs_set_header_nritems(right, 0);
2840
				wret = insert_ptr(trans, root, path,
2841
						  &disk_key, right->start,
2842 2843 2844
						  path->slots[1] + 1, 1);
				if (wret)
					ret = wret;
2845 2846

				btrfs_tree_unlock(path->nodes[0]);
2847 2848
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2849 2850
				path->slots[0] = 0;
				path->slots[1] += 1;
2851
				btrfs_mark_buffer_dirty(right);
2852 2853 2854
				return ret;
			}
			mid = slot;
2855 2856
			if (mid != nritems &&
			    leaf_space_used(l, mid, nritems - mid) +
2857
			    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2858 2859
				double_split = 1;
			}
2860 2861
		}
	} else {
2862
		if (leaf_space_used(l, 0, mid) + data_size >
2863
			BTRFS_LEAF_DATA_SIZE(root)) {
2864
			if (!extend && data_size && slot == 0) {
2865
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2866
				btrfs_set_header_nritems(right, 0);
2867 2868
				wret = insert_ptr(trans, root, path,
						  &disk_key,
2869
						  right->start,
2870
						  path->slots[1], 1);
2871 2872
				if (wret)
					ret = wret;
2873
				btrfs_tree_unlock(path->nodes[0]);
2874 2875
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2876
				path->slots[0] = 0;
2877 2878
				if (path->slots[1] == 0) {
					wret = fixup_low_keys(trans, root,
C
Chris Mason 已提交
2879
						      path, &disk_key, 1);
2880 2881 2882
					if (wret)
						ret = wret;
				}
2883
				btrfs_mark_buffer_dirty(right);
2884
				return ret;
2885
			} else if ((extend || !data_size) && slot == 0) {
2886 2887 2888 2889 2890
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
2891
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2892 2893
					double_split = 1;
				}
2894
			}
2895 2896
		}
	}
2897 2898 2899 2900 2901 2902 2903 2904 2905
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	copy_extent_buffer(right, l,
C
Chris Mason 已提交
2906 2907 2908
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);
2909

C
Chris Mason 已提交
2910
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2911
		      btrfs_item_end_nr(l, mid);
C
Chris Mason 已提交
2912

2913 2914
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925
		u32 ioff;

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(right, item);
2926
		btrfs_set_item_offset(right, item, ioff + rt_data_off);
C
Chris Mason 已提交
2927
	}
C
Chris Mason 已提交
2928

2929 2930 2931 2932 2933
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

2934
	btrfs_set_header_nritems(l, mid);
C
Chris Mason 已提交
2935
	ret = 0;
2936
	btrfs_item_key(right, &disk_key, 0);
2937 2938
	wret = insert_ptr(trans, root, path, &disk_key, right->start,
			  path->slots[1] + 1, 1);
C
Chris Mason 已提交
2939 2940
	if (wret)
		ret = wret;
2941 2942 2943

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
2944
	BUG_ON(path->slots[0] != slot);
2945

Z
Zheng Yan 已提交
2946 2947 2948
	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
	BUG_ON(ret);

2949
	if (mid <= slot) {
2950
		btrfs_tree_unlock(path->nodes[0]);
2951 2952
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
2953 2954
		path->slots[0] -= mid;
		path->slots[1] += 1;
2955 2956
	} else {
		btrfs_tree_unlock(right);
2957
		free_extent_buffer(right);
2958
	}
2959

2960
	BUG_ON(path->slots[0] < 0);
2961

2962 2963 2964 2965
	if (double_split) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
2966
	}
2967 2968 2969
	return ret;
}

2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023
/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation.  After
 * the split, the path is pointing to the old item.  The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be smaller enough to live alone on
 * a tree block with room for one extra struct btrfs_item
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	u32 item_size;
	struct extent_buffer *leaf;
	struct btrfs_key orig_key;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int ret = 0;
	int slot;
	u32 nritems;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;
	char *buf;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
	if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
		goto split;

	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	btrfs_release_path(root, path);

	path->search_for_split = 1;
	path->keep_locks = 1;

	ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
	path->search_for_split = 0;

	/* if our item isn't there or got smaller, return now */
	if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
							path->slots[0])) {
		path->keep_locks = 0;
		return -EAGAIN;
	}

3024 3025
	ret = split_leaf(trans, root, &orig_key, path,
			 sizeof(struct btrfs_item), 1);
3026 3027 3028
	path->keep_locks = 0;
	BUG_ON(ret);

3029 3030 3031 3032 3033 3034
	/*
	 * make sure any changes to the path from split_leaf leave it
	 * in a blocking state
	 */
	btrfs_set_path_blocking(path);

3035
	leaf = path->nodes[0];
3036
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093

split:
	item = btrfs_item_nr(leaf, path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);


	buf = kmalloc(item_size, GFP_NOFS);
	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);
	slot = path->slots[0] + 1;
	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);

	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(leaf, slot);

	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
	kfree(buf);
	return ret;
}

C
Chris Mason 已提交
3094 3095 3096 3097 3098 3099
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
C
Chris Mason 已提交
3100 3101 3102
int btrfs_truncate_item(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct btrfs_path *path,
3103
			u32 new_size, int from_end)
C
Chris Mason 已提交
3104 3105 3106 3107
{
	int ret = 0;
	int slot;
	int slot_orig;
3108 3109
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
3110 3111 3112 3113 3114 3115 3116 3117
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;

	slot_orig = path->slots[0];
3118
	leaf = path->nodes[0];
3119 3120 3121 3122 3123
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return 0;
C
Chris Mason 已提交
3124

3125
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
3126 3127
	data_end = leaf_data_end(root, leaf);

3128
	old_data_start = btrfs_item_offset_nr(leaf, slot);
3129

C
Chris Mason 已提交
3130 3131 3132 3133 3134 3135 3136 3137 3138 3139
	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
3140 3141
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
3142 3143 3144 3145 3146 3147 3148 3149 3150

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}

3151 3152
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff + size_diff);
C
Chris Mason 已提交
3153
	}
3154 3155 3156 3157 3158 3159

	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

C
Chris Mason 已提交
3160
	/* shift the data */
3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183
	if (from_end) {
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
C
Chris Mason 已提交
3184 3185
				      (unsigned long)fi,
				      offsetof(struct btrfs_file_extent_item,
3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199
						 disk_bytenr));
			}
		}

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, root, path, &disk_key, 1);
	}
3200 3201 3202 3203

	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
3204 3205

	ret = 0;
3206 3207
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3208
		BUG();
3209
	}
C
Chris Mason 已提交
3210 3211 3212
	return ret;
}

C
Chris Mason 已提交
3213 3214 3215
/*
 * make the item pointed to by the path bigger, data_size is the new size.
 */
3216 3217 3218
int btrfs_extend_item(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, struct btrfs_path *path,
		      u32 data_size)
3219 3220 3221 3222
{
	int ret = 0;
	int slot;
	int slot_orig;
3223 3224
	struct extent_buffer *leaf;
	struct btrfs_item *item;
3225 3226 3227 3228 3229 3230 3231
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;

	slot_orig = path->slots[0];
3232
	leaf = path->nodes[0];
3233

3234
	nritems = btrfs_header_nritems(leaf);
3235 3236
	data_end = leaf_data_end(root, leaf);

3237 3238
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
3239
		BUG();
3240
	}
3241
	slot = path->slots[0];
3242
	old_data = btrfs_item_end_nr(leaf, slot);
3243 3244

	BUG_ON(slot < 0);
3245 3246
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3247 3248
		printk(KERN_CRIT "slot %d too large, nritems %d\n",
		       slot, nritems);
3249 3250
		BUG_ON(1);
	}
3251 3252 3253 3254 3255 3256

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
3257 3258
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
3259 3260 3261 3262 3263 3264 3265 3266

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}
3267 3268
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff - data_size);
3269
	}
3270

3271 3272 3273 3274 3275
	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

3276
	/* shift the data */
3277
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3278 3279
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);
3280

3281
	data_end = old_data;
3282 3283 3284 3285
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);
3286 3287

	ret = 0;
3288 3289
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3290
		BUG();
3291
	}
3292 3293 3294
	return ret;
}

3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317
/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 * Returns the number of keys that were inserted.
 */
int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	int ret = 0;
	int slot;
	int i;
	u32 nritems;
	u32 total_data = 0;
	u32 total_size = 0;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct btrfs_key found_key;

3318 3319 3320 3321 3322 3323
	for (i = 0; i < nr; i++) {
		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
		    BTRFS_LEAF_DATA_SIZE(root)) {
			break;
			nr = i;
		}
3324
		total_data += data_size[i];
3325 3326 3327
		total_size += data_size[i] + sizeof(struct btrfs_item);
	}
	BUG_ON(nr == 0);
3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369

	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	if (btrfs_leaf_free_space(root, leaf) < total_size) {
		for (i = nr; i >= 0; i--) {
			total_data -= data_size[i];
			total_size -= data_size[i] + sizeof(struct btrfs_item);
			if (total_size < btrfs_leaf_free_space(root, leaf))
				break;
		}
		nr = i;
	}

	slot = path->slots[0];
	BUG_ON(slot < 0);

	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		item = btrfs_item_nr(leaf, slot);
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* figure out how many keys we can insert in here */
		total_data = data_size[0];
		for (i = 1; i < nr; i++) {
			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
				break;
			total_data += data_size[i];
		}
		nr = i;

		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3370
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446
			       slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		WARN_ON(leaf->map_token);
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff - total_data);
		}
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end - total_data, btrfs_leaf_data(leaf) +
			      data_end, old_data - data_end);
		data_end = old_data;
	} else {
		/*
		 * this sucks but it has to be done, if we are inserting at
		 * the end of the leaf only insert 1 of the items, since we
		 * have no way of knowing whats on the next leaf and we'd have
		 * to drop our current locks to figure it out
		 */
		nr = 1;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
	}

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
out:
	if (!ret)
		ret = nr;
	return ret;
}

C
Chris Mason 已提交
3447
/*
C
Chris Mason 已提交
3448
 * Given a key and some data, insert items into the tree.
C
Chris Mason 已提交
3449 3450
 * This does all the path init required, making room in the tree if needed.
 */
3451
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3452 3453
			    struct btrfs_root *root,
			    struct btrfs_path *path,
3454 3455
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
3456
{
3457 3458
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
3459
	int ret = 0;
3460
	int slot;
3461
	int slot_orig;
3462
	int i;
3463
	u32 nritems;
3464 3465
	u32 total_size = 0;
	u32 total_data = 0;
3466
	unsigned int data_end;
C
Chris Mason 已提交
3467 3468
	struct btrfs_disk_key disk_key;

C
Chris Mason 已提交
3469
	for (i = 0; i < nr; i++)
3470
		total_data += data_size[i];
3471

3472
	total_size = total_data + (nr * sizeof(struct btrfs_item));
3473
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
J
Josef Bacik 已提交
3474
	if (ret == 0)
3475
		return -EEXIST;
3476 3477
	if (ret < 0)
		goto out;
3478

3479
	slot_orig = path->slots[0];
3480
	leaf = path->nodes[0];
C
Chris Mason 已提交
3481

3482
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
3483
	data_end = leaf_data_end(root, leaf);
3484

3485
	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3486
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3487
		printk(KERN_CRIT "not enough freespace need %u have %d\n",
3488
		       total_size, btrfs_leaf_free_space(root, leaf));
3489
		BUG();
3490
	}
3491

3492
	slot = path->slots[0];
3493
	BUG_ON(slot < 0);
3494

3495
	if (slot != nritems) {
3496
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3497

3498 3499
		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3500
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3501 3502 3503
			       slot, old_data, data_end);
			BUG_ON(1);
		}
3504 3505 3506 3507
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
3508
		WARN_ON(leaf->map_token);
C
Chris Mason 已提交
3509
		for (i = slot; i < nritems; i++) {
3510
			u32 ioff;
3511

3512
			item = btrfs_item_nr(leaf, i);
3513 3514 3515 3516 3517 3518 3519 3520
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

3521
			ioff = btrfs_item_offset(leaf, item);
3522
			btrfs_set_item_offset(leaf, item, ioff - total_data);
C
Chris Mason 已提交
3523
		}
3524 3525 3526 3527
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}
3528 3529

		/* shift the items */
3530
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3531
			      btrfs_item_nr_offset(slot),
C
Chris Mason 已提交
3532
			      (nritems - slot) * sizeof(struct btrfs_item));
3533 3534

		/* shift the data */
3535
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3536
			      data_end - total_data, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3537
			      data_end, old_data - data_end);
3538 3539
		data_end = old_data;
	}
3540

3541
	/* setup the item for the new data */
3542 3543 3544 3545 3546 3547 3548 3549 3550
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
3551
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
3552 3553

	ret = 0;
3554 3555
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3556
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3557
	}
C
Chris Mason 已提交
3558

3559 3560
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3561
		BUG();
3562
	}
3563
out:
3564
	btrfs_unlock_up_safe(path, 1);
3565 3566 3567 3568 3569 3570 3571
	return ret;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
3572 3573 3574
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *cpu_key, void *data, u32
		      data_size)
3575 3576
{
	int ret = 0;
C
Chris Mason 已提交
3577
	struct btrfs_path *path;
3578 3579
	struct extent_buffer *leaf;
	unsigned long ptr;
3580

C
Chris Mason 已提交
3581 3582 3583
	path = btrfs_alloc_path();
	BUG_ON(!path);
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3584
	if (!ret) {
3585 3586 3587 3588
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
3589
	}
C
Chris Mason 已提交
3590
	btrfs_free_path(path);
C
Chris Mason 已提交
3591
	return ret;
3592 3593
}

C
Chris Mason 已提交
3594
/*
C
Chris Mason 已提交
3595
 * delete the pointer from a given node.
C
Chris Mason 已提交
3596
 *
C
Chris Mason 已提交
3597 3598
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
C
Chris Mason 已提交
3599
 */
3600 3601
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot)
3602
{
3603
	struct extent_buffer *parent = path->nodes[level];
3604
	u32 nritems;
C
Chris Mason 已提交
3605
	int ret = 0;
3606
	int wret;
3607

3608
	nritems = btrfs_header_nritems(parent);
C
Chris Mason 已提交
3609
	if (slot != nritems - 1) {
3610 3611 3612
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
C
Chris Mason 已提交
3613 3614
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
3615
	}
3616
	nritems--;
3617
	btrfs_set_header_nritems(parent, nritems);
3618
	if (nritems == 0 && parent == root->node) {
3619
		BUG_ON(btrfs_header_level(root->node) != 1);
3620
		/* just turn the root into a leaf and break */
3621
		btrfs_set_header_level(root->node, 0);
3622
	} else if (slot == 0) {
3623 3624 3625 3626
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
C
Chris Mason 已提交
3627 3628
		if (wret)
			ret = wret;
3629
	}
C
Chris Mason 已提交
3630
	btrfs_mark_buffer_dirty(parent);
C
Chris Mason 已提交
3631
	return ret;
3632 3633
}

3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].  bytenr is the node block pointer, but since the callers
 * already know it, it is faster to have them pass it down than to
 * read it out of the node again.
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path, u64 bytenr)
{
	int ret;
	u64 root_gen = btrfs_header_generation(path->nodes[1]);
3652 3653
	u64 parent_start = path->nodes[1]->start;
	u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3654 3655 3656 3657 3658

	ret = del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret)
		return ret;

3659 3660 3661 3662 3663 3664
	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

3665 3666
	ret = btrfs_free_extent(trans, root, bytenr,
				btrfs_level_size(root, 0),
3667
				parent_start, parent_owner,
3668
				root_gen, 0, 1);
3669 3670
	return ret;
}
C
Chris Mason 已提交
3671 3672 3673 3674
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
3675 3676
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
3677
{
3678 3679
	struct extent_buffer *leaf;
	struct btrfs_item *item;
3680 3681
	int last_off;
	int dsize = 0;
C
Chris Mason 已提交
3682 3683
	int ret = 0;
	int wret;
3684
	int i;
3685
	u32 nritems;
3686

3687
	leaf = path->nodes[0];
3688 3689 3690 3691 3692
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

3693
	nritems = btrfs_header_nritems(leaf);
3694

3695
	if (slot + nr != nritems) {
C
Chris Mason 已提交
3696
		int data_end = leaf_data_end(root, leaf);
3697 3698

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3699 3700
			      data_end + dsize,
			      btrfs_leaf_data(leaf) + data_end,
3701
			      last_off - data_end);
3702

3703
		for (i = slot + nr; i < nritems; i++) {
3704
			u32 ioff;
3705

3706
			item = btrfs_item_nr(leaf, i);
3707 3708 3709 3710 3711 3712 3713
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}
3714 3715
			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff + dsize);
C
Chris Mason 已提交
3716
		}
3717 3718 3719 3720 3721 3722

		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

3723
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3724
			      btrfs_item_nr_offset(slot + nr),
C
Chris Mason 已提交
3725
			      sizeof(struct btrfs_item) *
3726
			      (nritems - slot - nr));
3727
	}
3728 3729
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;
3730

C
Chris Mason 已提交
3731
	/* delete the leaf if we've emptied it */
3732
	if (nritems == 0) {
3733 3734
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
3735
		} else {
3736 3737
			ret = btrfs_del_leaf(trans, root, path, leaf->start);
			BUG_ON(ret);
3738
		}
3739
	} else {
3740
		int used = leaf_space_used(leaf, 0, nritems);
C
Chris Mason 已提交
3741
		if (slot == 0) {
3742 3743 3744
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
3745
			wret = fixup_low_keys(trans, root, path,
3746
					      &disk_key, 1);
C
Chris Mason 已提交
3747 3748 3749 3750
			if (wret)
				ret = wret;
		}

C
Chris Mason 已提交
3751
		/* delete the leaf if it is mostly empty */
3752
		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3753 3754 3755 3756
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
3757
			slot = path->slots[1];
3758 3759
			extent_buffer_get(leaf);

3760
			wret = push_leaf_left(trans, root, path, 1, 1);
3761
			if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3762
				ret = wret;
3763 3764 3765

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
3766
				wret = push_leaf_right(trans, root, path, 1, 1);
3767
				if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3768 3769
					ret = wret;
			}
3770 3771

			if (btrfs_header_nritems(leaf) == 0) {
3772
				path->slots[1] = slot;
C
Chris Mason 已提交
3773 3774
				ret = btrfs_del_leaf(trans, root, path,
						     leaf->start);
3775
				BUG_ON(ret);
3776
				free_extent_buffer(leaf);
C
Chris Mason 已提交
3777
			} else {
3778 3779 3780 3781 3782 3783 3784
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
3785
				free_extent_buffer(leaf);
3786
			}
3787
		} else {
3788
			btrfs_mark_buffer_dirty(leaf);
3789 3790
		}
	}
C
Chris Mason 已提交
3791
	return ret;
3792 3793
}

3794
/*
3795
 * search the tree again to find a leaf with lesser keys
3796 3797
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3798 3799 3800
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
3801 3802 3803
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
3804 3805 3806
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;
3807

3808
	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3809

3810 3811 3812 3813 3814 3815 3816 3817
	if (key.offset > 0)
		key.offset--;
	else if (key.type > 0)
		key.type--;
	else if (key.objectid > 0)
		key.objectid--;
	else
		return 1;
3818

3819 3820 3821 3822 3823 3824 3825 3826 3827
	btrfs_release_path(root, path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	if (ret < 0)
		return 0;
	return 1;
3828 3829
}

3830 3831 3832
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are either in cache or have a minimum
C
Chris Mason 已提交
3833
 * transaction id.  This is used by the btree defrag code, and tree logging
3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
C
Chris Mason 已提交
3845 3846 3847 3848
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
3849 3850 3851 3852
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3853
			 struct btrfs_key *max_key,
3854 3855 3856 3857 3858 3859
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
3860
	int sret;
3861 3862 3863 3864
	u32 nritems;
	int level;
	int ret = 1;

3865
	WARN_ON(!path->keep_locks);
3866 3867 3868
again:
	cur = btrfs_lock_root_node(root);
	level = btrfs_header_level(cur);
3869
	WARN_ON(path->nodes[level]);
3870 3871 3872 3873 3874 3875 3876
	path->nodes[level] = cur;
	path->locks[level] = 1;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
C
Chris Mason 已提交
3877
	while (1) {
3878 3879
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
3880
		sret = bin_search(cur, min_key, level, &slot);
3881

3882 3883
		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
3884 3885
			if (slot >= nritems)
				goto find_next_key;
3886 3887 3888 3889 3890
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
3891 3892
		if (sret && slot > 0)
			slot--;
3893 3894 3895 3896 3897
		/*
		 * check this node pointer against the cache_only and
		 * min_trans parameters.  If it isn't in cache or is too
		 * old, skip to the next one.
		 */
C
Chris Mason 已提交
3898
		while (slot < nritems) {
3899 3900 3901
			u64 blockptr;
			u64 gen;
			struct extent_buffer *tmp;
3902 3903
			struct btrfs_disk_key disk_key;

3904 3905 3906 3907 3908 3909 3910 3911 3912
			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			if (!cache_only)
				break;

3913 3914 3915 3916 3917 3918 3919 3920
			if (max_key) {
				btrfs_node_key(cur, &disk_key, slot);
				if (comp_keys(&disk_key, max_key) >= 0) {
					ret = 1;
					goto out;
				}
			}

3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931
			tmp = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));

			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				free_extent_buffer(tmp);
				break;
			}
			if (tmp)
				free_extent_buffer(tmp);
			slot++;
		}
3932
find_next_key:
3933 3934 3935 3936 3937
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
3938
			path->slots[level] = slot;
3939
			btrfs_set_path_blocking(path);
3940
			sret = btrfs_find_next_key(root, path, min_key, level,
3941
						  cache_only, min_trans);
3942
			if (sret == 0) {
3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956
				btrfs_release_path(root, path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1);
			goto out;
		}
3957
		btrfs_set_path_blocking(path);
3958 3959 3960
		cur = read_node_slot(root, cur, slot);

		btrfs_tree_lock(cur);
3961

3962 3963 3964
		path->locks[level - 1] = 1;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1);
3965
		btrfs_clear_path_blocking(path, NULL);
3966 3967 3968 3969
	}
out:
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
3970
	btrfs_set_path_blocking(path);
3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the cache_only and min_trans
 * parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
3986
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3987 3988
			struct btrfs_key *key, int lowest_level,
			int cache_only, u64 min_trans)
3989 3990 3991 3992 3993
{
	int level = lowest_level;
	int slot;
	struct extent_buffer *c;

3994
	WARN_ON(!path->keep_locks);
C
Chris Mason 已提交
3995
	while (level < BTRFS_MAX_LEVEL) {
3996 3997 3998 3999 4000
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
4001
next:
4002 4003
		if (slot >= btrfs_header_nritems(c)) {
			level++;
C
Chris Mason 已提交
4004
			if (level == BTRFS_MAX_LEVEL)
4005 4006 4007 4008 4009
				return 1;
			continue;
		}
		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029
		else {
			u64 blockptr = btrfs_node_blockptr(c, slot);
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (cache_only) {
				struct extent_buffer *cur;
				cur = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));
				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
					slot++;
					if (cur)
						free_extent_buffer(cur);
					goto next;
				}
				free_extent_buffer(cur);
			}
			if (gen < min_trans) {
				slot++;
				goto next;
			}
4030
			btrfs_node_key_to_cpu(c, key, slot);
4031
		}
4032 4033 4034 4035 4036
		return 0;
	}
	return 1;
}

C
Chris Mason 已提交
4037
/*
4038
 * search the tree again to find a leaf with greater keys
C
Chris Mason 已提交
4039 4040
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
4041
 */
C
Chris Mason 已提交
4042
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4043 4044 4045
{
	int slot;
	int level = 1;
4046 4047
	struct extent_buffer *c;
	struct extent_buffer *next = NULL;
4048 4049 4050 4051 4052
	struct btrfs_key key;
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
C
Chris Mason 已提交
4053
	if (nritems == 0)
4054 4055 4056 4057 4058
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);

	btrfs_release_path(root, path);
4059
	path->keep_locks = 1;
4060 4061 4062 4063 4064 4065
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

4066
	btrfs_set_path_blocking(path);
4067
	nritems = btrfs_header_nritems(path->nodes[0]);
4068 4069 4070 4071 4072 4073
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
4074
	if (nritems > 0 && path->slots[0] < nritems - 1) {
4075
		path->slots[0]++;
4076 4077
		goto done;
	}
4078

C
Chris Mason 已提交
4079
	while (level < BTRFS_MAX_LEVEL) {
4080
		if (!path->nodes[level])
C
Chris Mason 已提交
4081
			return 1;
4082

4083 4084
		slot = path->slots[level] + 1;
		c = path->nodes[level];
4085
		if (slot >= btrfs_header_nritems(c)) {
4086
			level++;
C
Chris Mason 已提交
4087
			if (level == BTRFS_MAX_LEVEL)
4088
				return 1;
4089 4090
			continue;
		}
4091

4092 4093
		if (next) {
			btrfs_tree_unlock(next);
4094
			free_extent_buffer(next);
4095
		}
4096

4097
		/* the path was set to blocking above */
4098 4099
		if (level == 1 && (path->locks[1] || path->skip_locking) &&
		    path->reada)
4100
			reada_for_search(root, path, level, slot, 0);
4101

4102
		next = read_node_slot(root, c, slot);
4103
		if (!path->skip_locking) {
4104
			btrfs_assert_tree_locked(c);
4105
			btrfs_tree_lock(next);
4106
			btrfs_set_lock_blocking(next);
4107
		}
4108 4109 4110
		break;
	}
	path->slots[level] = slot;
C
Chris Mason 已提交
4111
	while (1) {
4112 4113
		level--;
		c = path->nodes[level];
4114 4115
		if (path->locks[level])
			btrfs_tree_unlock(c);
4116
		free_extent_buffer(c);
4117 4118
		path->nodes[level] = next;
		path->slots[level] = 0;
4119 4120
		if (!path->skip_locking)
			path->locks[level] = 1;
4121 4122
		if (!level)
			break;
4123 4124

		btrfs_set_path_blocking(path);
4125 4126
		if (level == 1 && path->locks[1] && path->reada)
			reada_for_search(root, path, level, slot, 0);
4127
		next = read_node_slot(root, next, 0);
4128
		if (!path->skip_locking) {
4129
			btrfs_assert_tree_locked(path->nodes[level]);
4130
			btrfs_tree_lock(next);
4131
			btrfs_set_lock_blocking(next);
4132
		}
4133
	}
4134 4135
done:
	unlock_up(path, 0, 1);
4136 4137
	return 0;
}
4138

4139 4140 4141 4142 4143 4144
/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
4145 4146 4147 4148 4149 4150
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
4151
	u32 nritems;
4152 4153
	int ret;

C
Chris Mason 已提交
4154
	while (1) {
4155
		if (path->slots[0] == 0) {
4156
			btrfs_set_path_blocking(path);
4157 4158 4159 4160 4161 4162 4163
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
4164 4165 4166 4167 4168 4169
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

4170 4171 4172
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.type == type)
			return 0;
4173 4174 4175 4176 4177
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
4178 4179 4180
	}
	return 1;
}