ctree.c 101.3 KB
Newer Older
C
Chris Mason 已提交
1
/*
C
Chris Mason 已提交
2
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
C
Chris Mason 已提交
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

19
#include <linux/sched.h>
20 21
#include "ctree.h"
#include "disk-io.h"
22
#include "transaction.h"
23
#include "print-tree.h"
24
#include "locking.h"
25

26 27 28
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29
		      *root, struct btrfs_key *ins_key,
30
		      struct btrfs_path *path, int data_size, int extend);
31 32
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
33
			  struct extent_buffer *src, int empty);
34 35 36 37
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
38 39
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot);
40

C
Chris Mason 已提交
41
inline void btrfs_init_path(struct btrfs_path *p)
C
Chris Mason 已提交
42
{
C
Chris Mason 已提交
43
	memset(p, 0, sizeof(*p));
C
Chris Mason 已提交
44 45
}

C
Chris Mason 已提交
46
struct btrfs_path *btrfs_alloc_path(void)
C
Chris Mason 已提交
47
{
C
Chris Mason 已提交
48 49
	struct btrfs_path *path;
	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50
	if (path) {
C
Chris Mason 已提交
51
		btrfs_init_path(path);
52 53
		path->reada = 1;
	}
C
Chris Mason 已提交
54
	return path;
C
Chris Mason 已提交
55 56
}

C
Chris Mason 已提交
57
/* this also releases the path */
C
Chris Mason 已提交
58
void btrfs_free_path(struct btrfs_path *p)
59
{
C
Chris Mason 已提交
60 61
	btrfs_release_path(NULL, p);
	kmem_cache_free(btrfs_path_cachep, p);
62 63
}

C
Chris Mason 已提交
64 65 66 67 68 69
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that no locks or extent buffers held.
 */
C
Chris Mason 已提交
70
noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71 72
{
	int i;
73

C
Chris Mason 已提交
74
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75
		p->slots[i] = 0;
76
		if (!p->nodes[i])
77 78 79 80 81
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock(p->nodes[i]);
			p->locks[i] = 0;
		}
82
		free_extent_buffer(p->nodes[i]);
83
		p->nodes[i] = NULL;
84 85 86
	}
}

C
Chris Mason 已提交
87 88 89 90 91 92 93 94 95 96
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
97 98 99 100 101 102 103 104 105 106
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;
	spin_lock(&root->node_lock);
	eb = root->node;
	extent_buffer_get(eb);
	spin_unlock(&root->node_lock);
	return eb;
}

C
Chris Mason 已提交
107 108 109 110
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
111 112 113 114
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

C
Chris Mason 已提交
115
	while (1) {
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);

		spin_lock(&root->node_lock);
		if (eb == root->node) {
			spin_unlock(&root->node_lock);
			break;
		}
		spin_unlock(&root->node_lock);

		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

C
Chris Mason 已提交
132 133 134 135
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
136 137 138 139 140 141 142 143
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
}

C
Chris Mason 已提交
144 145 146 147 148
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
149 150 151 152 153 154 155 156 157
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	u32 nritems;
	int ret = 0;
	int level;
158
	struct btrfs_root *new_root;
159

160 161 162 163 164 165
	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
	if (!new_root)
		return -ENOMEM;

	memcpy(new_root, root, sizeof(*new_root));
	new_root->root_key.objectid = new_root_objectid;
166 167 168 169 170 171 172

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
173 174 175 176

	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
				     new_root_objectid, trans->transid,
				     level, buf->start, 0);
177 178
	if (IS_ERR(cow)) {
		kfree(new_root);
179
		return PTR_ERR(cow);
180
	}
181 182 183 184 185

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, new_root_objectid);
186
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187

Y
Yan Zheng 已提交
188 189 190 191
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

192
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
Z
Zheng Yan 已提交
193
	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194 195
	kfree(new_root);

196 197 198 199 200 201 202 203
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

C
Chris Mason 已提交
204
/*
C
Chris Mason 已提交
205 206 207 208
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
C
Chris Mason 已提交
209 210 211
 *
 * search_start -- an allocation hint for the new block
 *
C
Chris Mason 已提交
212 213 214
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
C
Chris Mason 已提交
215 216
 *
 * prealloc_dest -- if you have already reserved a destination for the cow,
C
Chris Mason 已提交
217 218
 * this uses that block instead of allocating a new one.
 * btrfs_alloc_reserved_extent is used to finish the allocation.
C
Chris Mason 已提交
219
 */
C
Chris Mason 已提交
220
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
221 222 223 224
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
225 226
			     u64 search_start, u64 empty_size,
			     u64 prealloc_dest)
C
Chris Mason 已提交
227
{
Z
Zheng Yan 已提交
228
	u64 parent_start;
229
	struct extent_buffer *cow;
230
	u32 nritems;
231
	int ret = 0;
232
	int level;
233
	int unlock_orig = 0;
234

235 236 237 238 239
	if (*cow_ret == buf)
		unlock_orig = 1;

	WARN_ON(!btrfs_tree_locked(buf));

Z
Zheng Yan 已提交
240 241 242 243 244
	if (parent)
		parent_start = parent->start;
	else
		parent_start = 0;

245 246
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
247
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248

249 250
	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
251

252 253 254 255 256 257 258
	if (prealloc_dest) {
		struct btrfs_key ins;

		ins.objectid = prealloc_dest;
		ins.offset = buf->len;
		ins.type = BTRFS_EXTENT_ITEM_KEY;

Z
Zheng Yan 已提交
259
		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260
						  root->root_key.objectid,
261
						  trans->transid, level, &ins);
262 263 264 265 266
		BUG_ON(ret);
		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
					    buf->len);
	} else {
		cow = btrfs_alloc_free_block(trans, root, buf->len,
Z
Zheng Yan 已提交
267
					     parent_start,
268
					     root->root_key.objectid,
Z
Zheng Yan 已提交
269 270
					     trans->transid, level,
					     search_start, empty_size);
271
	}
272 273
	if (IS_ERR(cow))
		return PTR_ERR(cow);
274

275
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
276
	btrfs_set_header_bytenr(cow, cow->start);
277 278
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, root->root_key.objectid);
279
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280

Y
Yan Zheng 已提交
281 282 283 284
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

285 286
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (btrfs_header_generation(buf) != trans->transid) {
Z
Zheng Yan 已提交
287 288
		u32 nr_extents;
		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289 290
		if (ret)
			return ret;
Z
Zheng Yan 已提交
291 292 293

		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
		WARN_ON(ret);
Z
Zheng Yan 已提交
294 295 296 297
	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
		/*
		 * There are only two places that can drop reference to
		 * tree blocks owned by living reloc trees, one is here,
Y
Yan Zheng 已提交
298
		 * the other place is btrfs_drop_subtree. In both places,
Z
Zheng Yan 已提交
299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
		 * we check reference count while tree block is locked.
		 * Furthermore, if reference count is one, it won't get
		 * increased by someone else.
		 */
		u32 refs;
		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
					      buf->len, &refs);
		BUG_ON(ret);
		if (refs == 1) {
			ret = btrfs_update_ref(trans, root, buf, cow,
					       0, nritems);
			clean_tree_block(trans, root, buf);
		} else {
			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
		}
		BUG_ON(ret);
315
	} else {
Z
Zheng Yan 已提交
316 317 318
		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
		if (ret)
			return ret;
319 320 321
		clean_tree_block(trans, root, buf);
	}

Z
Zheng Yan 已提交
322 323 324 325 326
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
		WARN_ON(ret);
	}

C
Chris Mason 已提交
327
	if (buf == root->node) {
328 329 330
		WARN_ON(parent && parent != buf);

		spin_lock(&root->node_lock);
C
Chris Mason 已提交
331
		root->node = cow;
332
		extent_buffer_get(cow);
333 334
		spin_unlock(&root->node_lock);

C
Chris Mason 已提交
335
		if (buf != root->commit_root) {
336
			btrfs_free_extent(trans, root, buf->start,
Z
Zheng Yan 已提交
337 338 339
					  buf->len, buf->start,
					  root->root_key.objectid,
					  btrfs_header_generation(buf),
340
					  level, 1);
C
Chris Mason 已提交
341
		}
342
		free_extent_buffer(buf);
343
		add_root_to_dirty_list(root);
C
Chris Mason 已提交
344
	} else {
345
		btrfs_set_node_blockptr(parent, parent_slot,
346
					cow->start);
347 348 349
		WARN_ON(trans->transid == 0);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
C
Chris Mason 已提交
350
		btrfs_mark_buffer_dirty(parent);
351
		WARN_ON(btrfs_header_generation(parent) != trans->transid);
352
		btrfs_free_extent(trans, root, buf->start, buf->len,
Z
Zheng Yan 已提交
353
				  parent_start, btrfs_header_owner(parent),
354
				  btrfs_header_generation(parent), level, 1);
C
Chris Mason 已提交
355
	}
356 357
	if (unlock_orig)
		btrfs_tree_unlock(buf);
358
	free_extent_buffer(buf);
C
Chris Mason 已提交
359
	btrfs_mark_buffer_dirty(cow);
C
Chris Mason 已提交
360
	*cow_ret = cow;
C
Chris Mason 已提交
361 362 363
	return 0;
}

C
Chris Mason 已提交
364 365 366 367 368
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
C
Chris Mason 已提交
369
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
370 371
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
372
		    struct extent_buffer **cow_ret, u64 prealloc_dest)
373 374
{
	u64 search_start;
375
	int ret;
C
Chris Mason 已提交
376

377
	if (trans->transaction != root->fs_info->running_transaction) {
C
Chris Mason 已提交
378 379 380
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)
381 382 383 384
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
C
Chris Mason 已提交
385 386 387
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)root->fs_info->generation);
388 389
		WARN_ON(1);
	}
C
Chris Mason 已提交
390

391
	spin_lock(&root->fs_info->hash_lock);
392 393
	if (btrfs_header_generation(buf) == trans->transid &&
	    btrfs_header_owner(buf) == root->root_key.objectid &&
394
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395
		*cow_ret = buf;
396
		spin_unlock(&root->fs_info->hash_lock);
397
		WARN_ON(prealloc_dest);
398 399
		return 0;
	}
400
	spin_unlock(&root->fs_info->hash_lock);
401
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
402
	ret = __btrfs_cow_block(trans, root, buf, parent,
403 404
				 parent_slot, cow_ret, search_start, 0,
				 prealloc_dest);
405
	return ret;
406 407
}

C
Chris Mason 已提交
408 409 410 411
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
412
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
413
{
414
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
415
		return 1;
416
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
417 418 419 420
		return 1;
	return 0;
}

421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444
/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	if (k1.objectid > k2->objectid)
		return 1;
	if (k1.objectid < k2->objectid)
		return -1;
	if (k1.type > k2->type)
		return 1;
	if (k1.type < k2->type)
		return -1;
	if (k1.offset > k2->offset)
		return 1;
	if (k1.offset < k2->offset)
		return -1;
	return 0;
}

445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463
/*
 * same as comp_keys only with two btrfs_key's
 */
static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
464

C
Chris Mason 已提交
465 466 467 468 469
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
470
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
471
		       struct btrfs_root *root, struct extent_buffer *parent,
472 473
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
474
{
475
	struct extent_buffer *cur;
476
	u64 blocknr;
477
	u64 gen;
478 479
	u64 search_start = *last_ret;
	u64 last_block = 0;
480 481 482 483 484
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
485
	int parent_level;
486 487
	int uptodate;
	u32 blocksize;
488 489
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;
490

491 492 493 494
	parent_level = btrfs_header_level(parent);
	if (cache_only && parent_level != 1)
		return 0;

C
Chris Mason 已提交
495
	if (trans->transaction != root->fs_info->running_transaction)
496
		WARN_ON(1);
C
Chris Mason 已提交
497
	if (trans->transid != root->fs_info->generation)
498
		WARN_ON(1);
499

500 501
	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
502 503 504 505 506 507 508
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;
509

510 511 512 513 514 515 516 517
		if (!parent->map_token) {
			map_extent_buffer(parent,
					btrfs_node_key_ptr_offset(i),
					sizeof(struct btrfs_key_ptr),
					&parent->map_token, &parent->kaddr,
					&parent->map_start, &parent->map_len,
					KM_USER1);
		}
518 519 520 521 522
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
523
		blocknr = btrfs_node_blockptr(parent, i);
524
		gen = btrfs_node_ptr_generation(parent, i);
525 526
		if (last_block == 0)
			last_block = blocknr;
527

528
		if (i > 0) {
529 530
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
531
		}
C
Chris Mason 已提交
532
		if (!close && i < end_slot - 2) {
533 534
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
535
		}
536 537
		if (close) {
			last_block = blocknr;
538
			continue;
539
		}
540 541 542 543 544
		if (parent->map_token) {
			unmap_extent_buffer(parent, parent->map_token,
					    KM_USER1);
			parent->map_token = NULL;
		}
545

546 547
		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
548
			uptodate = btrfs_buffer_uptodate(cur, gen);
549 550
		else
			uptodate = 0;
551
		if (!cur || !uptodate) {
552
			if (cache_only) {
553
				free_extent_buffer(cur);
554 555
				continue;
			}
556 557
			if (!cur) {
				cur = read_tree_block(root, blocknr,
558
							 blocksize, gen);
559
			} else if (!uptodate) {
560
				btrfs_read_buffer(cur, gen);
561
			}
562
		}
563
		if (search_start == 0)
564
			search_start = last_block;
565

566
		btrfs_tree_lock(cur);
567
		err = __btrfs_cow_block(trans, root, cur, parent, i,
568
					&cur, search_start,
569
					min(16 * blocksize,
570
					    (end_slot - i) * blocksize), 0);
Y
Yan 已提交
571
		if (err) {
572
			btrfs_tree_unlock(cur);
573
			free_extent_buffer(cur);
574
			break;
Y
Yan 已提交
575
		}
576 577
		search_start = cur->start;
		last_block = cur->start;
578
		*last_ret = search_start;
579 580
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
581
	}
582 583 584 585 586
	if (parent->map_token) {
		unmap_extent_buffer(parent, parent->map_token,
				    KM_USER1);
		parent->map_token = NULL;
	}
587 588 589
	return err;
}

C
Chris Mason 已提交
590 591 592 593 594
/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
C
Chris Mason 已提交
595
static inline unsigned int leaf_data_end(struct btrfs_root *root,
596
					 struct extent_buffer *leaf)
597
{
598
	u32 nr = btrfs_header_nritems(leaf);
599
	if (nr == 0)
C
Chris Mason 已提交
600
		return BTRFS_LEAF_DATA_SIZE(root);
601
	return btrfs_item_offset_nr(leaf, nr - 1);
602 603
}

C
Chris Mason 已提交
604 605 606 607
/*
 * extra debugging checks to make sure all the items in a key are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
608 609
static int check_node(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
610
{
611 612 613 614
	struct extent_buffer *parent = NULL;
	struct extent_buffer *node = path->nodes[level];
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key node_key;
C
Chris Mason 已提交
615
	int parent_slot;
616 617
	int slot;
	struct btrfs_key cpukey;
618
	u32 nritems = btrfs_header_nritems(node);
C
Chris Mason 已提交
619 620

	if (path->nodes[level + 1])
621
		parent = path->nodes[level + 1];
A
Aneesh 已提交
622

623
	slot = path->slots[level];
624 625
	BUG_ON(nritems == 0);
	if (parent) {
A
Aneesh 已提交
626
		parent_slot = path->slots[level + 1];
627 628 629
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_node_key(node, &node_key, 0);
		BUG_ON(memcmp(&parent_key, &node_key,
C
Chris Mason 已提交
630
			      sizeof(struct btrfs_disk_key)));
631
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
632
		       btrfs_header_bytenr(node));
C
Chris Mason 已提交
633
	}
C
Chris Mason 已提交
634
	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
635
	if (slot != 0) {
636 637 638
		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
639 640
	}
	if (slot < nritems - 1) {
641 642 643
		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
C
Chris Mason 已提交
644 645 646 647
	}
	return 0;
}

C
Chris Mason 已提交
648 649 650 651
/*
 * extra checking to make sure all the items in a leaf are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
652 653
static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
654
{
655 656
	struct extent_buffer *leaf = path->nodes[level];
	struct extent_buffer *parent = NULL;
C
Chris Mason 已提交
657
	int parent_slot;
658
	struct btrfs_key cpukey;
659 660 661
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key leaf_key;
	int slot = path->slots[0];
662

663
	u32 nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
664 665

	if (path->nodes[level + 1])
666
		parent = path->nodes[level + 1];
667 668 669 670 671

	if (nritems == 0)
		return 0;

	if (parent) {
A
Aneesh 已提交
672
		parent_slot = path->slots[level + 1];
673 674
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_item_key(leaf, &leaf_key, 0);
675

676
		BUG_ON(memcmp(&parent_key, &leaf_key,
C
Chris Mason 已提交
677
		       sizeof(struct btrfs_disk_key)));
678
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
679
		       btrfs_header_bytenr(leaf));
680 681 682 683 684 685
	}
	if (slot != 0 && slot < nritems - 1) {
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
		if (comp_keys(&leaf_key, &cpukey) <= 0) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
686
			printk(KERN_CRIT "slot %d offset bad key\n", slot);
687 688 689 690 691
			BUG_ON(1);
		}
		if (btrfs_item_offset_nr(leaf, slot - 1) !=
		       btrfs_item_end_nr(leaf, slot)) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
692
			printk(KERN_CRIT "slot %d offset bad\n", slot);
693 694
			BUG_ON(1);
		}
695 696
	}
	if (slot < nritems - 1) {
697 698 699 700 701 702
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
		if (btrfs_item_offset_nr(leaf, slot) !=
			btrfs_item_end_nr(leaf, slot + 1)) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
703
			printk(KERN_CRIT "slot %d offset bad\n", slot);
704 705
			BUG_ON(1);
		}
C
Chris Mason 已提交
706
	}
707 708
	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
C
Chris Mason 已提交
709 710 711
	return 0;
}

C
Chris Mason 已提交
712
static noinline int check_block(struct btrfs_root *root,
713
				struct btrfs_path *path, int level)
C
Chris Mason 已提交
714
{
715
	return 0;
C
Chris Mason 已提交
716
	if (level == 0)
C
Chris Mason 已提交
717 718
		return check_leaf(root, path, level);
	return check_node(root, path, level);
C
Chris Mason 已提交
719 720
}

C
Chris Mason 已提交
721
/*
722 723 724
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
C
Chris Mason 已提交
725 726 727 728 729 730
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
731 732 733 734
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
735 736 737 738 739
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
740
	struct btrfs_disk_key *tmp = NULL;
741 742 743 744 745 746
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
747
	int err;
748

C
Chris Mason 已提交
749
	while (low < high) {
750
		mid = (low + high) / 2;
751 752 753 754 755
		offset = p + mid * item_size;

		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
756
			if (map_token) {
757
				unmap_extent_buffer(eb, map_token, KM_USER0);
758 759
				map_token = NULL;
			}
760 761

			err = map_private_extent_buffer(eb, offset,
762 763 764 765 766 767 768 769 770 771 772 773
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}
774 775 776 777 778

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
779 780 781 782 783 784 785 786
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
787 788
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
789 790 791 792
			return 0;
		}
	}
	*slot = low;
793 794
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
795 796 797
	return 1;
}

C
Chris Mason 已提交
798 799 800 801
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
802 803
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
804
{
805 806 807
	if (level == 0) {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
C
Chris Mason 已提交
808
					  sizeof(struct btrfs_item),
809
					  key, btrfs_header_nritems(eb),
810
					  slot);
811
	} else {
812 813
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
C
Chris Mason 已提交
814
					  sizeof(struct btrfs_key_ptr),
815
					  key, btrfs_header_nritems(eb),
816
					  slot);
817 818 819 820
	}
	return -1;
}

C
Chris Mason 已提交
821 822 823 824
/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
825
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
826
				   struct extent_buffer *parent, int slot)
827
{
828
	int level = btrfs_header_level(parent);
829 830
	if (slot < 0)
		return NULL;
831
	if (slot >= btrfs_header_nritems(parent))
832
		return NULL;
833 834 835

	BUG_ON(level == 0);

836
	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
837 838
		       btrfs_level_size(root, level - 1),
		       btrfs_node_ptr_generation(parent, slot));
839 840
}

C
Chris Mason 已提交
841 842 843 844 845
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 */
846
static noinline int balance_level(struct btrfs_trans_handle *trans,
847 848
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
849
{
850 851 852 853
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
854 855 856 857
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
858
	int err_on_enospc = 0;
859
	u64 orig_ptr;
860 861 862 863

	if (level == 0)
		return 0;

864
	mid = path->nodes[level];
865
	WARN_ON(!path->locks[level]);
866 867
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

868
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
869

C
Chris Mason 已提交
870
	if (level < BTRFS_MAX_LEVEL - 1)
871
		parent = path->nodes[level + 1];
872 873
	pslot = path->slots[level + 1];

C
Chris Mason 已提交
874 875 876 877
	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
878 879
	if (!parent) {
		struct extent_buffer *child;
880

881
		if (btrfs_header_nritems(mid) != 1)
882 883 884
			return 0;

		/* promote the child to a root */
885
		child = read_node_slot(root, mid, 0);
886
		btrfs_tree_lock(child);
887
		BUG_ON(!child);
888
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 890
		BUG_ON(ret);

891
		spin_lock(&root->node_lock);
892
		root->node = child;
893 894
		spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
895 896 897
		ret = btrfs_update_extent_ref(trans, root, child->start,
					      mid->start, child->start,
					      root->root_key.objectid,
898
					      trans->transid, level - 1);
Z
Zheng Yan 已提交
899 900
		BUG_ON(ret);

901
		add_root_to_dirty_list(root);
902 903
		btrfs_tree_unlock(child);
		path->locks[level] = 0;
904
		path->nodes[level] = NULL;
905
		clean_tree_block(trans, root, mid);
906
		btrfs_tree_unlock(mid);
907
		/* once for the path */
908
		free_extent_buffer(mid);
909
		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
Z
Zheng Yan 已提交
910
					mid->start, root->root_key.objectid,
911 912
					btrfs_header_generation(mid),
					level, 1);
913
		/* once for the root ptr */
914
		free_extent_buffer(mid);
915
		return ret;
916
	}
917
	if (btrfs_header_nritems(mid) >
C
Chris Mason 已提交
918
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
919 920
		return 0;

921
	if (btrfs_header_nritems(mid) < 2)
922 923
		err_on_enospc = 1;

924 925
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
926
		btrfs_tree_lock(left);
927
		wret = btrfs_cow_block(trans, root, left,
928
				       parent, pslot - 1, &left, 0);
929 930 931 932
		if (wret) {
			ret = wret;
			goto enospc;
		}
933
	}
934 935
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
936
		btrfs_tree_lock(right);
937
		wret = btrfs_cow_block(trans, root, right,
938
				       parent, pslot + 1, &right, 0);
939 940 941 942 943 944 945
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
946 947
	if (left) {
		orig_slot += btrfs_header_nritems(left);
948
		wret = push_node_left(trans, root, left, mid, 1);
949 950
		if (wret < 0)
			ret = wret;
951
		if (btrfs_header_nritems(mid) < 2)
952
			err_on_enospc = 1;
953
	}
954 955 956 957

	/*
	 * then try to empty the right most buffer into the middle
	 */
958
	if (right) {
959
		wret = push_node_left(trans, root, mid, right, 1);
960
		if (wret < 0 && wret != -ENOSPC)
961
			ret = wret;
962
		if (btrfs_header_nritems(right) == 0) {
963
			u64 bytenr = right->start;
964
			u64 generation = btrfs_header_generation(parent);
965 966
			u32 blocksize = right->len;

967
			clean_tree_block(trans, root, right);
968
			btrfs_tree_unlock(right);
969
			free_extent_buffer(right);
970
			right = NULL;
971 972
			wret = del_ptr(trans, root, path, level + 1, pslot +
				       1);
973 974
			if (wret)
				ret = wret;
975
			wret = btrfs_free_extent(trans, root, bytenr,
Z
Zheng Yan 已提交
976
						 blocksize, parent->start,
977
						 btrfs_header_owner(parent),
978
						 generation, level, 1);
979 980 981
			if (wret)
				ret = wret;
		} else {
982 983 984 985
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
986 987
		}
	}
988
	if (btrfs_header_nritems(mid) == 1) {
989 990 991 992 993 994 995 996 997
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
998 999
		BUG_ON(!left);
		wret = balance_node_right(trans, root, mid, left);
1000
		if (wret < 0) {
1001
			ret = wret;
1002 1003
			goto enospc;
		}
1004 1005 1006 1007 1008
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
1009 1010
		BUG_ON(wret == 1);
	}
1011
	if (btrfs_header_nritems(mid) == 0) {
1012
		/* we've managed to empty the middle node, drop it */
1013
		u64 root_gen = btrfs_header_generation(parent);
1014 1015
		u64 bytenr = mid->start;
		u32 blocksize = mid->len;
1016

1017
		clean_tree_block(trans, root, mid);
1018
		btrfs_tree_unlock(mid);
1019
		free_extent_buffer(mid);
1020
		mid = NULL;
1021
		wret = del_ptr(trans, root, path, level + 1, pslot);
1022 1023
		if (wret)
			ret = wret;
1024
		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
Z
Zheng Yan 已提交
1025
					 parent->start,
1026
					 btrfs_header_owner(parent),
1027
					 root_gen, level, 1);
1028 1029
		if (wret)
			ret = wret;
1030 1031
	} else {
		/* update the parent key to reflect our changes */
1032 1033 1034 1035
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
1036
	}
1037

1038
	/* update the path */
1039 1040 1041
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
1042
			/* left was locked after cow */
1043
			path->nodes[level] = left;
1044 1045
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
1046 1047
			if (mid) {
				btrfs_tree_unlock(mid);
1048
				free_extent_buffer(mid);
1049
			}
1050
		} else {
1051
			orig_slot -= btrfs_header_nritems(left);
1052 1053 1054
			path->slots[level] = orig_slot;
		}
	}
1055
	/* double check we haven't messed things up */
C
Chris Mason 已提交
1056
	check_block(root, path, level);
C
Chris Mason 已提交
1057
	if (orig_ptr !=
1058
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1059
		BUG();
1060
enospc:
1061 1062
	if (right) {
		btrfs_tree_unlock(right);
1063
		free_extent_buffer(right);
1064 1065 1066 1067
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
1068
		free_extent_buffer(left);
1069
	}
1070 1071 1072
	return ret;
}

C
Chris Mason 已提交
1073 1074 1075 1076
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
C
Chris Mason 已提交
1077
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1078 1079
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
1080
{
1081 1082 1083 1084
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
1085 1086 1087 1088 1089 1090 1091 1092 1093
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 1;

1094
	mid = path->nodes[level];
1095
	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1096 1097 1098
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1)
1099
		parent = path->nodes[level + 1];
1100 1101
	pslot = path->slots[level + 1];

1102
	if (!parent)
1103 1104
		return 1;

1105
	left = read_node_slot(root, parent, pslot - 1);
1106 1107

	/* first, try to make some room in the middle buffer */
1108
	if (left) {
1109
		u32 left_nr;
1110 1111

		btrfs_tree_lock(left);
1112
		left_nr = btrfs_header_nritems(left);
C
Chris Mason 已提交
1113 1114 1115
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1116
			ret = btrfs_cow_block(trans, root, left, parent,
1117
					      pslot - 1, &left, 0);
1118 1119 1120 1121
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
1122
						      left, mid, 0);
1123
			}
C
Chris Mason 已提交
1124
		}
1125 1126 1127
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1128
			struct btrfs_disk_key disk_key;
1129
			orig_slot += left_nr;
1130 1131 1132 1133 1134
			btrfs_node_key(mid, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
1135 1136
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
1137
				btrfs_tree_unlock(mid);
1138
				free_extent_buffer(mid);
1139 1140
			} else {
				orig_slot -=
1141
					btrfs_header_nritems(left);
1142
				path->slots[level] = orig_slot;
1143
				btrfs_tree_unlock(left);
1144
				free_extent_buffer(left);
1145 1146 1147
			}
			return 0;
		}
1148
		btrfs_tree_unlock(left);
1149
		free_extent_buffer(left);
1150
	}
1151
	right = read_node_slot(root, parent, pslot + 1);
1152 1153 1154 1155

	/*
	 * then try to empty the right most buffer into the middle
	 */
1156
	if (right) {
C
Chris Mason 已提交
1157
		u32 right_nr;
1158
		btrfs_tree_lock(right);
1159
		right_nr = btrfs_header_nritems(right);
C
Chris Mason 已提交
1160 1161 1162
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1163 1164
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
1165
					      &right, 0);
1166 1167 1168 1169
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
1170
							  right, mid);
1171
			}
C
Chris Mason 已提交
1172
		}
1173 1174 1175
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1176 1177 1178 1179 1180 1181 1182 1183
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
1184 1185
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
1186
					btrfs_header_nritems(mid);
1187
				btrfs_tree_unlock(mid);
1188
				free_extent_buffer(mid);
1189
			} else {
1190
				btrfs_tree_unlock(right);
1191
				free_extent_buffer(right);
1192 1193 1194
			}
			return 0;
		}
1195
		btrfs_tree_unlock(right);
1196
		free_extent_buffer(right);
1197 1198 1199 1200
	}
	return 1;
}

1201
/*
C
Chris Mason 已提交
1202 1203
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
1204
 */
1205 1206 1207
static noinline void reada_for_search(struct btrfs_root *root,
				      struct btrfs_path *path,
				      int level, int slot, u64 objectid)
1208
{
1209
	struct extent_buffer *node;
1210
	struct btrfs_disk_key disk_key;
1211 1212
	u32 nritems;
	u64 search;
1213
	u64 target;
1214
	u64 nread = 0;
1215
	int direction = path->reada;
1216
	struct extent_buffer *eb;
1217 1218 1219
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;
1220

1221
	if (level != 1)
1222 1223 1224
		return;

	if (!path->nodes[level])
1225 1226
		return;

1227
	node = path->nodes[level];
1228

1229
	search = btrfs_node_blockptr(node, slot);
1230 1231
	blocksize = btrfs_level_size(root, level - 1);
	eb = btrfs_find_tree_block(root, search, blocksize);
1232 1233
	if (eb) {
		free_extent_buffer(eb);
1234 1235 1236
		return;
	}

1237
	target = search;
1238

1239
	nritems = btrfs_header_nritems(node);
1240
	nr = slot;
C
Chris Mason 已提交
1241
	while (1) {
1242 1243 1244 1245 1246 1247 1248 1249
		if (direction < 0) {
			if (nr == 0)
				break;
			nr--;
		} else if (direction > 0) {
			nr++;
			if (nr >= nritems)
				break;
1250
		}
1251 1252 1253 1254 1255
		if (path->reada < 0 && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
1256
		search = btrfs_node_blockptr(node, nr);
1257 1258
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
1259 1260
			readahead_tree_block(root, search, blocksize,
				     btrfs_node_ptr_generation(node, nr));
1261 1262 1263
			nread += blocksize;
		}
		nscan++;
1264
		if ((nread > 65536 || nscan > 32))
1265
			break;
1266 1267
	}
}
1268

C
Chris Mason 已提交
1269
/*
C
Chris Mason 已提交
1270 1271 1272 1273
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
C
Chris Mason 已提交
1274
 *
C
Chris Mason 已提交
1275 1276 1277
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
C
Chris Mason 已提交
1278
 *
C
Chris Mason 已提交
1279 1280
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
C
Chris Mason 已提交
1281
 */
1282 1283
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
1284 1285 1286
{
	int i;
	int skip_level = level;
1287
	int no_skips = 0;
1288 1289 1290 1291 1292 1293 1294
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
1295
		if (!no_skips && path->slots[i] == 0) {
1296 1297 1298
			skip_level = i + 1;
			continue;
		}
1299
		if (!no_skips && path->keep_locks) {
1300 1301 1302
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
1303
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
1304 1305 1306 1307
				skip_level = i + 1;
				continue;
			}
		}
1308 1309 1310
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

1311 1312 1313 1314 1315 1316 1317 1318
		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock(t);
			path->locks[i] = 0;
		}
	}
}

C
Chris Mason 已提交
1319 1320 1321 1322 1323 1324
/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
C
Chris Mason 已提交
1325 1326
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
C
Chris Mason 已提交
1327 1328 1329 1330
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
C
Chris Mason 已提交
1331
 */
1332 1333 1334
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
1335
{
1336
	struct extent_buffer *b;
1337
	struct extent_buffer *tmp;
1338 1339 1340
	int slot;
	int ret;
	int level;
1341
	int should_reada = p->reada;
1342
	int lowest_unlock = 1;
1343
	int blocksize;
1344
	u8 lowest_level = 0;
1345 1346
	u64 blocknr;
	u64 gen;
1347
	struct btrfs_key prealloc_block;
1348

1349
	lowest_level = p->lowest_level;
1350
	WARN_ON(lowest_level && ins_len > 0);
C
Chris Mason 已提交
1351
	WARN_ON(p->nodes[0] != NULL);
1352

1353 1354
	if (ins_len < 0)
		lowest_unlock = 2;
1355 1356 1357

	prealloc_block.objectid = 0;

1358
again:
1359 1360 1361 1362
	if (p->skip_locking)
		b = btrfs_root_node(root);
	else
		b = btrfs_lock_root_node(root);
1363

1364
	while (b) {
1365
		level = btrfs_header_level(b);
1366 1367 1368 1369 1370 1371 1372 1373 1374

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

C
Chris Mason 已提交
1375 1376
		if (cow) {
			int wret;
1377 1378 1379 1380

			/* is a cow on this block not required */
			spin_lock(&root->fs_info->hash_lock);
			if (btrfs_header_generation(b) == trans->transid &&
1381
			    btrfs_header_owner(b) == root->root_key.objectid &&
1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416
			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
				spin_unlock(&root->fs_info->hash_lock);
				goto cow_done;
			}
			spin_unlock(&root->fs_info->hash_lock);

			/* ok, we have to cow, is our old prealloc the right
			 * size?
			 */
			if (prealloc_block.objectid &&
			    prealloc_block.offset != b->len) {
				btrfs_free_reserved_extent(root,
					   prealloc_block.objectid,
					   prealloc_block.offset);
				prealloc_block.objectid = 0;
			}

			/*
			 * for higher level blocks, try not to allocate blocks
			 * with the block and the parent locks held.
			 */
			if (level > 1 && !prealloc_block.objectid &&
			    btrfs_path_lock_waiting(p, level)) {
				u32 size = b->len;
				u64 hint = b->start;

				btrfs_release_path(root, p);
				ret = btrfs_reserve_extent(trans, root,
							   size, size, 0,
							   hint, (u64)-1,
							   &prealloc_block, 0);
				BUG_ON(ret);
				goto again;
			}

C
Chris Mason 已提交
1417 1418 1419
			wret = btrfs_cow_block(trans, root, b,
					       p->nodes[level + 1],
					       p->slots[level + 1],
1420 1421
					       &b, prealloc_block.objectid);
			prealloc_block.objectid = 0;
1422
			if (wret) {
1423
				free_extent_buffer(b);
1424 1425
				ret = wret;
				goto done;
1426
			}
C
Chris Mason 已提交
1427
		}
1428
cow_done:
C
Chris Mason 已提交
1429
		BUG_ON(!cow && ins_len);
1430
		if (level != btrfs_header_level(b))
C
Chris Mason 已提交
1431
			WARN_ON(1);
1432
		level = btrfs_header_level(b);
1433

1434
		p->nodes[level] = b;
1435 1436
		if (!p->skip_locking)
			p->locks[level] = 1;
1437

C
Chris Mason 已提交
1438
		ret = check_block(root, p, level);
1439 1440 1441 1442
		if (ret) {
			ret = -1;
			goto done;
		}
1443

1444 1445
		ret = bin_search(b, key, level, &slot);
		if (level != 0) {
1446 1447 1448
			if (ret && slot > 0)
				slot -= 1;
			p->slots[level] = slot;
1449 1450
			if ((p->search_for_split || ins_len > 0) &&
			    btrfs_header_nritems(b) >=
1451
			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1452
				int sret = split_node(trans, root, p, level);
C
Chris Mason 已提交
1453
				BUG_ON(sret > 0);
1454 1455 1456 1457
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1458 1459
				b = p->nodes[level];
				slot = p->slots[level];
1460
			} else if (ins_len < 0) {
1461 1462
				int sret = balance_level(trans, root, p,
							 level);
1463 1464 1465 1466
				if (sret) {
					ret = sret;
					goto done;
				}
1467
				b = p->nodes[level];
1468 1469
				if (!b) {
					btrfs_release_path(NULL, p);
1470
					goto again;
1471
				}
1472
				slot = p->slots[level];
1473
				BUG_ON(btrfs_header_nritems(b) == 1);
C
Chris Mason 已提交
1474
			}
1475 1476
			unlock_up(p, level, lowest_unlock);

1477
			/* this is only true while dropping a snapshot */
1478
			if (level == lowest_level) {
1479 1480
				ret = 0;
				goto done;
1481
			}
1482

1483 1484 1485 1486 1487 1488
			blocknr = btrfs_node_blockptr(b, slot);
			gen = btrfs_node_ptr_generation(b, slot);
			blocksize = btrfs_level_size(root, level - 1);

			tmp = btrfs_find_tree_block(root, blocknr, blocksize);
			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1489 1490 1491 1492 1493 1494 1495 1496 1497
				b = tmp;
			} else {
				/*
				 * reduce lock contention at high levels
				 * of the btree by dropping locks before
				 * we read.
				 */
				if (level > 1) {
					btrfs_release_path(NULL, p);
1498 1499
					if (tmp)
						free_extent_buffer(tmp);
1500 1501 1502 1503 1504
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);

1505 1506
					tmp = read_tree_block(root, blocknr,
							 blocksize, gen);
1507 1508 1509 1510
					if (tmp)
						free_extent_buffer(tmp);
					goto again;
				} else {
1511 1512
					if (tmp)
						free_extent_buffer(tmp);
1513 1514 1515 1516
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);
1517 1518 1519
					b = read_node_slot(root, b, slot);
				}
			}
1520 1521
			if (!p->skip_locking)
				btrfs_tree_lock(b);
1522 1523
		} else {
			p->slots[level] = slot;
1524 1525
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
1526
				int sret = split_leaf(trans, root, key,
1527
						      p, ins_len, ret == 0);
C
Chris Mason 已提交
1528
				BUG_ON(sret > 0);
1529 1530 1531 1532
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1533
			}
1534 1535
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock);
1536
			goto done;
1537 1538
		}
	}
1539 1540 1541 1542 1543 1544 1545 1546 1547
	ret = 1;
done:
	if (prealloc_block.objectid) {
		btrfs_free_reserved_extent(root,
			   prealloc_block.objectid,
			   prealloc_block.offset);
	}

	return ret;
1548 1549
}

Z
Zheng Yan 已提交
1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589
int btrfs_merge_path(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_key *node_keys,
		     u64 *nodes, int lowest_level)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_key key;
	u64 bytenr;
	u64 generation;
	u32 blocksize;
	int level;
	int slot;
	int key_match;
	int ret;

	eb = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
	BUG_ON(ret);

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		if (level == 0 || level <= lowest_level)
			break;

		ret = bin_search(parent, &node_keys[lowest_level], level,
				 &slot);
		if (ret && slot > 0)
			slot--;

		bytenr = btrfs_node_blockptr(parent, slot);
		if (nodes[level - 1] == bytenr)
			break;

		blocksize = btrfs_level_size(root, level - 1);
		generation = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(eb, &key, slot);
		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));

Y
Yan Zheng 已提交
1590 1591 1592 1593 1594 1595
		if (generation == trans->transid) {
			eb = read_tree_block(root, bytenr, blocksize,
					     generation);
			btrfs_tree_lock(eb);
		}

Z
Zheng Yan 已提交
1596 1597 1598
		/*
		 * if node keys match and node pointer hasn't been modified
		 * in the running transaction, we can merge the path. for
Y
Yan Zheng 已提交
1599 1600 1601
		 * blocks owened by reloc trees, the node pointer check is
		 * skipped, this is because these blocks are fully controlled
		 * by the space balance code, no one else can modify them.
Z
Zheng Yan 已提交
1602 1603 1604
		 */
		if (!nodes[level - 1] || !key_match ||
		    (generation == trans->transid &&
Y
Yan Zheng 已提交
1605 1606 1607 1608 1609 1610
		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
			if (level == 1 || level == lowest_level + 1) {
				if (generation == trans->transid) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
				}
Z
Zheng Yan 已提交
1611
				break;
Y
Yan Zheng 已提交
1612
			}
Z
Zheng Yan 已提交
1613

Y
Yan Zheng 已提交
1614 1615 1616 1617 1618
			if (generation != trans->transid) {
				eb = read_tree_block(root, bytenr, blocksize,
						generation);
				btrfs_tree_lock(eb);
			}
Z
Zheng Yan 已提交
1619 1620 1621 1622 1623

			ret = btrfs_cow_block(trans, root, eb, parent, slot,
					      &eb, 0);
			BUG_ON(ret);

Y
Yan Zheng 已提交
1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				if (!nodes[level - 1]) {
					nodes[level - 1] = eb->start;
					memcpy(&node_keys[level - 1], &key,
					       sizeof(node_keys[0]));
				} else {
					WARN_ON(1);
				}
			}

Z
Zheng Yan 已提交
1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			parent = eb;
			continue;
		}

		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
		btrfs_mark_buffer_dirty(parent);

		ret = btrfs_inc_extent_ref(trans, root,
					nodes[level - 1],
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
1650
					level - 1);
Z
Zheng Yan 已提交
1651 1652
		BUG_ON(ret);

Y
Yan Zheng 已提交
1653 1654 1655 1656 1657
		/*
		 * If the block was created in the running transaction,
		 * it's possible this is the last reference to it, so we
		 * should drop the subtree.
		 */
Z
Zheng Yan 已提交
1658
		if (generation == trans->transid) {
Y
Yan Zheng 已提交
1659 1660
			ret = btrfs_drop_subtree(trans, root, eb, parent);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1661 1662
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
Y
Yan Zheng 已提交
1663 1664 1665 1666 1667 1668 1669
		} else {
			ret = btrfs_free_extent(trans, root, bytenr,
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
					level - 1, 1);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1670 1671 1672 1673 1674 1675 1676 1677
		}
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return 0;
}

C
Chris Mason 已提交
1678 1679 1680 1681 1682 1683
/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node is points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
C
Chris Mason 已提交
1684 1685 1686
 *
 * If this fails to write a tree block, it returns -1, but continues
 * fixing up the blocks in ram so the tree is consistent.
C
Chris Mason 已提交
1687
 */
1688 1689 1690
static int fixup_low_keys(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_path *path,
			  struct btrfs_disk_key *key, int level)
1691 1692
{
	int i;
C
Chris Mason 已提交
1693
	int ret = 0;
1694 1695
	struct extent_buffer *t;

C
Chris Mason 已提交
1696
	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1697
		int tslot = path->slots[i];
1698
		if (!path->nodes[i])
1699
			break;
1700 1701
		t = path->nodes[i];
		btrfs_set_node_key(t, key, tslot);
C
Chris Mason 已提交
1702
		btrfs_mark_buffer_dirty(path->nodes[i]);
1703 1704 1705
		if (tslot != 0)
			break;
	}
C
Chris Mason 已提交
1706
	return ret;
1707 1708
}

Z
Zheng Yan 已提交
1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (comp_keys(&disk_key, new_key) >= 0)
			return -1;
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (comp_keys(&disk_key, new_key) <= 0)
			return -1;
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
		fixup_low_keys(trans, root, path, &disk_key, 1);
	return 0;
}

C
Chris Mason 已提交
1744 1745
/*
 * try to push data from one node into the next node left in the
1746
 * tree.
C
Chris Mason 已提交
1747 1748 1749
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
C
Chris Mason 已提交
1750
 */
1751 1752
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
1753
			  struct extent_buffer *src, int empty)
1754 1755
{
	int push_items = 0;
1756 1757
	int src_nritems;
	int dst_nritems;
C
Chris Mason 已提交
1758
	int ret = 0;
1759

1760 1761
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1762
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1763 1764
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);
1765

1766
	if (!empty && src_nritems <= 8)
1767 1768
		return 1;

C
Chris Mason 已提交
1769
	if (push_items <= 0)
1770 1771
		return 1;

1772
	if (empty) {
1773
		push_items = min(src_nritems, push_items);
1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);
1786

1787 1788 1789
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
C
Chris Mason 已提交
1790
			   push_items * sizeof(struct btrfs_key_ptr));
1791

1792
	if (push_items < src_nritems) {
1793 1794 1795 1796 1797 1798 1799 1800 1801
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
1802 1803 1804 1805

	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
	BUG_ON(ret);

1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817
	return ret;
}

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
1818 1819 1820 1821
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
1822 1823 1824 1825 1826 1827 1828
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

1829 1830 1831
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

1832 1833
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1834
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
C
Chris Mason 已提交
1835
	if (push_items <= 0)
1836
		return 1;
1837

C
Chris Mason 已提交
1838
	if (src_nritems < 4)
1839
		return 1;
1840 1841 1842

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
C
Chris Mason 已提交
1843
	if (max_push >= src_nritems)
1844
		return 1;
Y
Yan 已提交
1845

1846 1847 1848
	if (max_push < push_items)
		push_items = max_push;

1849 1850 1851 1852
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
1853

1854 1855 1856
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
C
Chris Mason 已提交
1857
			   push_items * sizeof(struct btrfs_key_ptr));
1858

1859 1860
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
1861

1862 1863
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
1864 1865 1866 1867

	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
	BUG_ON(ret);

C
Chris Mason 已提交
1868
	return ret;
1869 1870
}

C
Chris Mason 已提交
1871 1872 1873 1874
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
C
Chris Mason 已提交
1875 1876
 *
 * returns zero on success or < 0 on failure.
C
Chris Mason 已提交
1877
 */
C
Chris Mason 已提交
1878
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
1879 1880
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
C
Chris Mason 已提交
1881
{
1882
	u64 lower_gen;
1883 1884
	struct extent_buffer *lower;
	struct extent_buffer *c;
1885
	struct extent_buffer *old;
1886
	struct btrfs_disk_key lower_key;
Z
Zheng Yan 已提交
1887
	int ret;
C
Chris Mason 已提交
1888 1889 1890 1891

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

1892 1893 1894 1895 1896 1897
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

Z
Zheng Yan 已提交
1898 1899
	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, trans->transid,
1900
				   level, root->node->start, 0);
1901 1902
	if (IS_ERR(c))
		return PTR_ERR(c);
1903

1904 1905 1906
	memset_extent_buffer(c, 0, 0, root->nodesize);
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
1907
	btrfs_set_header_bytenr(c, c->start);
1908 1909 1910 1911 1912 1913
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);
1914 1915 1916 1917 1918

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

1919
	btrfs_set_node_key(c, &lower_key, 0);
1920
	btrfs_set_node_blockptr(c, 0, lower->start);
1921
	lower_gen = btrfs_header_generation(lower);
Z
Zheng Yan 已提交
1922
	WARN_ON(lower_gen != trans->transid);
1923 1924

	btrfs_set_node_ptr_generation(c, 0, lower_gen);
1925

1926
	btrfs_mark_buffer_dirty(c);
1927

1928 1929
	spin_lock(&root->node_lock);
	old = root->node;
1930
	root->node = c;
1931 1932
	spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
1933 1934 1935
	ret = btrfs_update_extent_ref(trans, root, lower->start,
				      lower->start, c->start,
				      root->root_key.objectid,
1936
				      trans->transid, level - 1);
Z
Zheng Yan 已提交
1937 1938
	BUG_ON(ret);

1939 1940 1941
	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

1942
	add_root_to_dirty_list(root);
1943 1944
	extent_buffer_get(c);
	path->nodes[level] = c;
1945
	path->locks[level] = 1;
C
Chris Mason 已提交
1946 1947 1948 1949
	path->slots[level] = 0;
	return 0;
}

C
Chris Mason 已提交
1950 1951 1952
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
C
Chris Mason 已提交
1953
 *
C
Chris Mason 已提交
1954 1955
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
C
Chris Mason 已提交
1956 1957
 *
 * returns zero on success and < 0 on any error
C
Chris Mason 已提交
1958
 */
1959 1960
static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, struct btrfs_disk_key
1961
		      *key, u64 bytenr, int slot, int level)
C
Chris Mason 已提交
1962
{
1963
	struct extent_buffer *lower;
C
Chris Mason 已提交
1964
	int nritems;
C
Chris Mason 已提交
1965 1966

	BUG_ON(!path->nodes[level]);
1967 1968
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
C
Chris Mason 已提交
1969 1970
	if (slot > nritems)
		BUG();
C
Chris Mason 已提交
1971
	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
C
Chris Mason 已提交
1972 1973
		BUG();
	if (slot != nritems) {
1974 1975 1976
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
C
Chris Mason 已提交
1977
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
1978
	}
1979
	btrfs_set_node_key(lower, key, slot);
1980
	btrfs_set_node_blockptr(lower, slot, bytenr);
1981 1982
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
1983 1984
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
C
Chris Mason 已提交
1985 1986 1987
	return 0;
}

C
Chris Mason 已提交
1988 1989 1990 1991 1992 1993
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
C
Chris Mason 已提交
1994 1995
 *
 * returns 0 on success and < 0 on failure
C
Chris Mason 已提交
1996
 */
1997 1998 1999
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
2000
{
2001 2002 2003
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
2004
	int mid;
C
Chris Mason 已提交
2005
	int ret;
C
Chris Mason 已提交
2006
	int wret;
2007
	u32 c_nritems;
2008

2009
	c = path->nodes[level];
2010
	WARN_ON(btrfs_header_generation(c) != trans->transid);
2011
	if (c == root->node) {
C
Chris Mason 已提交
2012
		/* trying to split the root, lets make a new one */
2013
		ret = insert_new_root(trans, root, path, level + 1);
C
Chris Mason 已提交
2014 2015
		if (ret)
			return ret;
2016 2017
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
2018 2019
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
2020
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2021
			return 0;
2022 2023
		if (ret < 0)
			return ret;
2024
	}
2025

2026
	c_nritems = btrfs_header_nritems(c);
2027

2028
	split = btrfs_alloc_free_block(trans, root, root->nodesize,
Z
Zheng Yan 已提交
2029 2030 2031
					path->nodes[level + 1]->start,
					root->root_key.objectid,
					trans->transid, level, c->start, 0);
2032 2033 2034 2035 2036
	if (IS_ERR(split))
		return PTR_ERR(split);

	btrfs_set_header_flags(split, btrfs_header_flags(c));
	btrfs_set_header_level(split, btrfs_header_level(c));
2037
	btrfs_set_header_bytenr(split, split->start);
2038 2039
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_owner(split, root->root_key.objectid);
2040
	btrfs_set_header_flags(split, 0);
2041 2042 2043
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
2044 2045 2046
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);
2047

2048
	mid = (c_nritems + 1) / 2;
2049 2050 2051 2052 2053 2054 2055

	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
C
Chris Mason 已提交
2056 2057
	ret = 0;

2058 2059 2060 2061
	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	btrfs_node_key(split, &disk_key, 0);
2062
	wret = insert_ptr(trans, root, path, &disk_key, split->start,
2063
			  path->slots[level + 1] + 1,
C
Chris Mason 已提交
2064
			  level + 1);
C
Chris Mason 已提交
2065 2066 2067
	if (wret)
		ret = wret;

Z
Zheng Yan 已提交
2068 2069 2070
	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
	BUG_ON(ret);

C
Chris Mason 已提交
2071
	if (path->slots[level] >= mid) {
C
Chris Mason 已提交
2072
		path->slots[level] -= mid;
2073
		btrfs_tree_unlock(c);
2074 2075
		free_extent_buffer(c);
		path->nodes[level] = split;
C
Chris Mason 已提交
2076 2077
		path->slots[level + 1] += 1;
	} else {
2078
		btrfs_tree_unlock(split);
2079
		free_extent_buffer(split);
2080
	}
C
Chris Mason 已提交
2081
	return ret;
2082 2083
}

C
Chris Mason 已提交
2084 2085 2086 2087 2088
/*
 * how many bytes are required to store the items in a leaf.  start
 * and nr indicate which items in the leaf to check.  This totals up the
 * space used both by the item structs and the item data
 */
2089
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2090 2091
{
	int data_len;
2092
	int nritems = btrfs_header_nritems(l);
2093
	int end = min(nritems, start + nr) - 1;
2094 2095 2096

	if (!nr)
		return 0;
2097 2098
	data_len = btrfs_item_end_nr(l, start);
	data_len = data_len - btrfs_item_offset_nr(l, end);
C
Chris Mason 已提交
2099
	data_len += sizeof(struct btrfs_item) * nr;
2100
	WARN_ON(data_len < 0);
2101 2102 2103
	return data_len;
}

2104 2105 2106 2107 2108
/*
 * The space between the end of the leaf items and
 * the start of the leaf data.  IOW, how much room
 * the leaf has left for both items and data
 */
C
Chris Mason 已提交
2109
noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2110
				   struct extent_buffer *leaf)
2111
{
2112 2113 2114 2115
	int nritems = btrfs_header_nritems(leaf);
	int ret;
	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
C
Chris Mason 已提交
2116 2117
		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
		       "used %d nritems %d\n",
J
Jens Axboe 已提交
2118
		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2119 2120 2121
		       leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
2122 2123
}

C
Chris Mason 已提交
2124 2125 2126
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
C
Chris Mason 已提交
2127 2128 2129
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
C
Chris Mason 已提交
2130
 */
2131
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2132 2133
			   *root, struct btrfs_path *path, int data_size,
			   int empty)
C
Chris Mason 已提交
2134
{
2135 2136 2137 2138
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2139
	int slot;
2140
	u32 i;
C
Chris Mason 已提交
2141 2142 2143
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2144
	struct btrfs_item *item;
2145
	u32 left_nritems;
2146
	u32 nr;
2147
	u32 right_nritems;
2148
	u32 data_end;
2149
	u32 this_item_size;
2150
	int ret;
C
Chris Mason 已提交
2151 2152

	slot = path->slots[1];
C
Chris Mason 已提交
2153
	if (!path->nodes[1])
C
Chris Mason 已提交
2154
		return 1;
C
Chris Mason 已提交
2155

C
Chris Mason 已提交
2156
	upper = path->nodes[1];
2157
	if (slot >= btrfs_header_nritems(upper) - 1)
C
Chris Mason 已提交
2158
		return 1;
2159

2160 2161
	WARN_ON(!btrfs_tree_locked(path->nodes[1]));

2162
	right = read_node_slot(root, upper, slot + 1);
2163
	btrfs_tree_lock(right);
C
Chris Mason 已提交
2164
	free_space = btrfs_leaf_free_space(root, right);
2165
	if (free_space < data_size)
2166
		goto out_unlock;
2167

C
Chris Mason 已提交
2168
	/* cow and double check */
2169
	ret = btrfs_cow_block(trans, root, right, upper,
2170
			      slot + 1, &right, 0);
2171 2172 2173
	if (ret)
		goto out_unlock;

C
Chris Mason 已提交
2174
	free_space = btrfs_leaf_free_space(root, right);
2175
	if (free_space < data_size)
2176
		goto out_unlock;
C
Chris Mason 已提交
2177

2178
	left_nritems = btrfs_header_nritems(left);
2179 2180
	if (left_nritems == 0)
		goto out_unlock;
2181

2182 2183 2184 2185 2186
	if (empty)
		nr = 0;
	else
		nr = 1;

Z
Zheng Yan 已提交
2187
	if (path->slots[0] >= left_nritems)
2188
		push_space += data_size;
Z
Zheng Yan 已提交
2189

2190 2191
	i = left_nritems - 1;
	while (i >= nr) {
2192
		item = btrfs_item_nr(left, i);
2193

Z
Zheng Yan 已提交
2194 2195 2196 2197 2198 2199 2200 2201 2202 2203
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

C
Chris Mason 已提交
2204
		if (path->slots[0] == i)
2205
			push_space += data_size;
2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216

		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
C
Chris Mason 已提交
2217
			break;
Z
Zheng Yan 已提交
2218

C
Chris Mason 已提交
2219
		push_items++;
2220
		push_space += this_item_size + sizeof(*item);
2221 2222 2223
		if (i == 0)
			break;
		i--;
2224 2225 2226 2227
	}
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
C
Chris Mason 已提交
2228
	}
2229

2230 2231
	if (push_items == 0)
		goto out_unlock;
2232

2233
	if (!empty && push_items == left_nritems)
2234
		WARN_ON(1);
2235

C
Chris Mason 已提交
2236
	/* push left to right */
2237
	right_nritems = btrfs_header_nritems(right);
2238

2239
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
C
Chris Mason 已提交
2240
	push_space -= leaf_data_end(root, left);
2241

C
Chris Mason 已提交
2242
	/* make room in the right data area */
2243 2244 2245 2246 2247 2248
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

C
Chris Mason 已提交
2249
	/* copy from the left data area */
2250
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
C
Chris Mason 已提交
2251 2252 2253
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);
2254 2255 2256 2257 2258

	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2259
	/* copy the items from left to right */
2260 2261 2262
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));
C
Chris Mason 已提交
2263 2264

	/* update the item pointers */
2265
	right_nritems += push_items;
2266
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2267
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2268
	for (i = 0; i < right_nritems; i++) {
2269
		item = btrfs_item_nr(right, i);
2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}
		push_space -= btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
C
Chris Mason 已提交
2284
	}
2285
	left_nritems -= push_items;
2286
	btrfs_set_header_nritems(left, left_nritems);
C
Chris Mason 已提交
2287

2288 2289
	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
2290
	btrfs_mark_buffer_dirty(right);
2291

Z
Zheng Yan 已提交
2292 2293 2294
	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
	BUG_ON(ret);

2295 2296
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
C
Chris Mason 已提交
2297
	btrfs_mark_buffer_dirty(upper);
C
Chris Mason 已提交
2298

C
Chris Mason 已提交
2299
	/* then fixup the leaf pointer in the path */
2300 2301
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
2302 2303 2304
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2305 2306
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
C
Chris Mason 已提交
2307 2308
		path->slots[1] += 1;
	} else {
2309
		btrfs_tree_unlock(right);
2310
		free_extent_buffer(right);
C
Chris Mason 已提交
2311 2312
	}
	return 0;
2313 2314 2315 2316 2317

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
C
Chris Mason 已提交
2318
}
2319

C
Chris Mason 已提交
2320 2321 2322 2323
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 */
2324
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2325 2326
			  *root, struct btrfs_path *path, int data_size,
			  int empty)
2327
{
2328 2329 2330
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
2331 2332 2333 2334 2335
	int slot;
	int i;
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2336
	struct btrfs_item *item;
2337
	u32 old_left_nritems;
2338
	u32 right_nritems;
2339
	u32 nr;
C
Chris Mason 已提交
2340 2341
	int ret = 0;
	int wret;
2342 2343
	u32 this_item_size;
	u32 old_left_item_size;
2344 2345

	slot = path->slots[1];
2346
	if (slot == 0)
2347
		return 1;
2348
	if (!path->nodes[1])
2349
		return 1;
2350

2351
	right_nritems = btrfs_header_nritems(right);
C
Chris Mason 已提交
2352
	if (right_nritems == 0)
2353 2354
		return 1;

2355 2356
	WARN_ON(!btrfs_tree_locked(path->nodes[1]));

2357
	left = read_node_slot(root, path->nodes[1], slot - 1);
2358
	btrfs_tree_lock(left);
C
Chris Mason 已提交
2359
	free_space = btrfs_leaf_free_space(root, left);
2360
	if (free_space < data_size) {
2361 2362
		ret = 1;
		goto out;
2363
	}
C
Chris Mason 已提交
2364 2365

	/* cow and double check */
2366
	ret = btrfs_cow_block(trans, root, left,
2367
			      path->nodes[1], slot - 1, &left, 0);
2368 2369
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
2370 2371
		ret = 1;
		goto out;
2372
	}
2373

C
Chris Mason 已提交
2374
	free_space = btrfs_leaf_free_space(root, left);
2375
	if (free_space < data_size) {
2376 2377
		ret = 1;
		goto out;
C
Chris Mason 已提交
2378 2379
	}

2380 2381 2382 2383 2384 2385
	if (empty)
		nr = right_nritems;
	else
		nr = right_nritems - 1;

	for (i = 0; i < nr; i++) {
2386
		item = btrfs_item_nr(right, i);
2387 2388 2389 2390 2391 2392 2393 2394
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

Z
Zheng Yan 已提交
2395 2396 2397 2398 2399 2400 2401 2402 2403 2404
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

2405
		if (path->slots[0] == i)
2406
			push_space += data_size;
2407 2408 2409

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
2410
			break;
2411

2412
		push_items++;
2413 2414 2415 2416 2417 2418
		push_space += this_item_size + sizeof(*item);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2419
	}
2420

2421
	if (push_items == 0) {
2422 2423
		ret = 1;
		goto out;
2424
	}
2425
	if (!empty && push_items == btrfs_header_nritems(right))
2426
		WARN_ON(1);
2427

2428
	/* push data from right to left */
2429 2430 2431 2432 2433
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2434
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
C
Chris Mason 已提交
2435
		     btrfs_item_offset_nr(right, push_items - 1);
2436 2437

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
C
Chris Mason 已提交
2438 2439
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
2440
		     btrfs_item_offset_nr(right, push_items - 1),
C
Chris Mason 已提交
2441
		     push_space);
2442
	old_left_nritems = btrfs_header_nritems(left);
2443
	BUG_ON(old_left_nritems <= 0);
2444

2445
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
C
Chris Mason 已提交
2446
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2447
		u32 ioff;
2448

2449
		item = btrfs_item_nr(left, i);
2450 2451 2452 2453 2454 2455 2456 2457
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

2458 2459
		ioff = btrfs_item_offset(left, item);
		btrfs_set_item_offset(left, item,
2460
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2461
	}
2462
	btrfs_set_header_nritems(left, old_left_nritems + push_items);
2463 2464 2465 2466
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}
2467 2468

	/* fixup right node */
2469
	if (push_items > right_nritems) {
C
Chris Mason 已提交
2470 2471
		printk(KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);
2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2484 2485 2486
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
2487
	}
2488 2489
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2490
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2491 2492
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		push_space = push_space - btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2508
	}
2509

2510
	btrfs_mark_buffer_dirty(left);
2511 2512
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
2513

Z
Zheng Yan 已提交
2514 2515 2516 2517
	ret = btrfs_update_ref(trans, root, right, left,
			       old_left_nritems, push_items);
	BUG_ON(ret);

2518 2519
	btrfs_item_key(right, &disk_key, 0);
	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
C
Chris Mason 已提交
2520 2521
	if (wret)
		ret = wret;
2522 2523 2524 2525

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
2526 2527 2528
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2529 2530
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
2531 2532
		path->slots[1] -= 1;
	} else {
2533
		btrfs_tree_unlock(left);
2534
		free_extent_buffer(left);
2535 2536
		path->slots[0] -= push_items;
	}
2537
	BUG_ON(path->slots[0] < 0);
C
Chris Mason 已提交
2538
	return ret;
2539 2540 2541 2542
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
2543 2544
}

C
Chris Mason 已提交
2545 2546 2547
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
C
Chris Mason 已提交
2548 2549
 *
 * returns 0 if all went well and < 0 on failure.
C
Chris Mason 已提交
2550
 */
2551 2552 2553 2554 2555
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
2556
{
2557
	struct extent_buffer *l;
2558
	u32 nritems;
2559 2560
	int mid;
	int slot;
2561
	struct extent_buffer *right;
2562 2563 2564
	int data_copy_size;
	int rt_data_off;
	int i;
2565
	int ret = 0;
C
Chris Mason 已提交
2566
	int wret;
2567 2568
	int double_split;
	int num_doubles = 0;
2569
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2570

C
Chris Mason 已提交
2571
	/* first try to make some room by pushing left and right */
2572
	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2573
		wret = push_leaf_right(trans, root, path, data_size, 0);
C
Chris Mason 已提交
2574
		if (wret < 0)
C
Chris Mason 已提交
2575
			return wret;
2576
		if (wret) {
2577
			wret = push_leaf_left(trans, root, path, data_size, 0);
2578 2579 2580 2581
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];
C
Chris Mason 已提交
2582

2583
		/* did the pushes work? */
2584
		if (btrfs_leaf_free_space(root, l) >= data_size)
2585
			return 0;
2586
	}
C
Chris Mason 已提交
2587

C
Chris Mason 已提交
2588
	if (!path->nodes[1]) {
2589
		ret = insert_new_root(trans, root, path, 1);
C
Chris Mason 已提交
2590 2591 2592
		if (ret)
			return ret;
	}
2593 2594 2595
again:
	double_split = 0;
	l = path->nodes[0];
2596
	slot = path->slots[0];
2597
	nritems = btrfs_header_nritems(l);
C
Chris Mason 已提交
2598
	mid = (nritems + 1) / 2;
2599

2600
	right = btrfs_alloc_free_block(trans, root, root->leafsize,
Z
Zheng Yan 已提交
2601 2602 2603
					path->nodes[1]->start,
					root->root_key.objectid,
					trans->transid, 0, l->start, 0);
2604 2605
	if (IS_ERR(right)) {
		BUG_ON(1);
2606
		return PTR_ERR(right);
2607
	}
2608 2609

	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2610
	btrfs_set_header_bytenr(right, right->start);
2611 2612 2613 2614 2615 2616
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);
2617 2618 2619 2620

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);
2621 2622
	if (mid <= slot) {
		if (nritems == 1 ||
2623
		    leaf_space_used(l, mid, nritems - mid) + data_size >
2624 2625 2626
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2627
				btrfs_set_header_nritems(right, 0);
2628
				wret = insert_ptr(trans, root, path,
2629
						  &disk_key, right->start,
2630 2631 2632
						  path->slots[1] + 1, 1);
				if (wret)
					ret = wret;
2633 2634

				btrfs_tree_unlock(path->nodes[0]);
2635 2636
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2637 2638
				path->slots[0] = 0;
				path->slots[1] += 1;
2639
				btrfs_mark_buffer_dirty(right);
2640 2641 2642
				return ret;
			}
			mid = slot;
2643 2644
			if (mid != nritems &&
			    leaf_space_used(l, mid, nritems - mid) +
2645
			    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2646 2647
				double_split = 1;
			}
2648 2649
		}
	} else {
2650
		if (leaf_space_used(l, 0, mid) + data_size >
2651
			BTRFS_LEAF_DATA_SIZE(root)) {
2652
			if (!extend && data_size && slot == 0) {
2653
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2654
				btrfs_set_header_nritems(right, 0);
2655 2656
				wret = insert_ptr(trans, root, path,
						  &disk_key,
2657
						  right->start,
2658
						  path->slots[1], 1);
2659 2660
				if (wret)
					ret = wret;
2661
				btrfs_tree_unlock(path->nodes[0]);
2662 2663
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2664
				path->slots[0] = 0;
2665 2666
				if (path->slots[1] == 0) {
					wret = fixup_low_keys(trans, root,
C
Chris Mason 已提交
2667
						      path, &disk_key, 1);
2668 2669 2670
					if (wret)
						ret = wret;
				}
2671
				btrfs_mark_buffer_dirty(right);
2672
				return ret;
2673
			} else if ((extend || !data_size) && slot == 0) {
2674 2675 2676 2677 2678
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
2679
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2680 2681
					double_split = 1;
				}
2682
			}
2683 2684
		}
	}
2685 2686 2687 2688 2689 2690 2691 2692 2693
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	copy_extent_buffer(right, l,
C
Chris Mason 已提交
2694 2695 2696
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);
2697

C
Chris Mason 已提交
2698
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2699
		      btrfs_item_end_nr(l, mid);
C
Chris Mason 已提交
2700

2701 2702
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713
		u32 ioff;

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(right, item);
2714
		btrfs_set_item_offset(right, item, ioff + rt_data_off);
C
Chris Mason 已提交
2715
	}
C
Chris Mason 已提交
2716

2717 2718 2719 2720 2721
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

2722
	btrfs_set_header_nritems(l, mid);
C
Chris Mason 已提交
2723
	ret = 0;
2724
	btrfs_item_key(right, &disk_key, 0);
2725 2726
	wret = insert_ptr(trans, root, path, &disk_key, right->start,
			  path->slots[1] + 1, 1);
C
Chris Mason 已提交
2727 2728
	if (wret)
		ret = wret;
2729 2730 2731

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
2732
	BUG_ON(path->slots[0] != slot);
2733

Z
Zheng Yan 已提交
2734 2735 2736
	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
	BUG_ON(ret);

2737
	if (mid <= slot) {
2738
		btrfs_tree_unlock(path->nodes[0]);
2739 2740
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
2741 2742
		path->slots[0] -= mid;
		path->slots[1] += 1;
2743 2744
	} else {
		btrfs_tree_unlock(right);
2745
		free_extent_buffer(right);
2746
	}
2747

2748
	BUG_ON(path->slots[0] < 0);
2749

2750 2751 2752 2753
	if (double_split) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
2754
	}
2755 2756 2757
	return ret;
}

2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811
/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation.  After
 * the split, the path is pointing to the old item.  The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be smaller enough to live alone on
 * a tree block with room for one extra struct btrfs_item
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	u32 item_size;
	struct extent_buffer *leaf;
	struct btrfs_key orig_key;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int ret = 0;
	int slot;
	u32 nritems;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;
	char *buf;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
	if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
		goto split;

	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	btrfs_release_path(root, path);

	path->search_for_split = 1;
	path->keep_locks = 1;

	ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
	path->search_for_split = 0;

	/* if our item isn't there or got smaller, return now */
	if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
							path->slots[0])) {
		path->keep_locks = 0;
		return -EAGAIN;
	}

2812 2813
	ret = split_leaf(trans, root, &orig_key, path,
			 sizeof(struct btrfs_item), 1);
2814 2815 2816 2817
	path->keep_locks = 0;
	BUG_ON(ret);

	leaf = path->nodes[0];
2818
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875

split:
	item = btrfs_item_nr(leaf, path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);


	buf = kmalloc(item_size, GFP_NOFS);
	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);
	slot = path->slots[0] + 1;
	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);

	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(leaf, slot);

	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
	kfree(buf);
	return ret;
}

C
Chris Mason 已提交
2876 2877 2878 2879 2880 2881
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
C
Chris Mason 已提交
2882 2883 2884
int btrfs_truncate_item(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct btrfs_path *path,
2885
			u32 new_size, int from_end)
C
Chris Mason 已提交
2886 2887 2888 2889
{
	int ret = 0;
	int slot;
	int slot_orig;
2890 2891
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
2892 2893 2894 2895 2896 2897 2898 2899
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;

	slot_orig = path->slots[0];
2900
	leaf = path->nodes[0];
2901 2902 2903 2904 2905
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return 0;
C
Chris Mason 已提交
2906

2907
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
2908 2909
	data_end = leaf_data_end(root, leaf);

2910
	old_data_start = btrfs_item_offset_nr(leaf, slot);
2911

C
Chris Mason 已提交
2912 2913 2914 2915 2916 2917 2918 2919 2920 2921
	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
2922 2923
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
2924 2925 2926 2927 2928 2929 2930 2931 2932

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}

2933 2934
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff + size_diff);
C
Chris Mason 已提交
2935
	}
2936 2937 2938 2939 2940 2941

	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

C
Chris Mason 已提交
2942
	/* shift the data */
2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965
	if (from_end) {
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
C
Chris Mason 已提交
2966 2967
				      (unsigned long)fi,
				      offsetof(struct btrfs_file_extent_item,
2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981
						 disk_bytenr));
			}
		}

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, root, path, &disk_key, 1);
	}
2982 2983 2984 2985

	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
2986 2987

	ret = 0;
2988 2989
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
2990
		BUG();
2991
	}
C
Chris Mason 已提交
2992 2993 2994
	return ret;
}

C
Chris Mason 已提交
2995 2996 2997
/*
 * make the item pointed to by the path bigger, data_size is the new size.
 */
2998 2999 3000
int btrfs_extend_item(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, struct btrfs_path *path,
		      u32 data_size)
3001 3002 3003 3004
{
	int ret = 0;
	int slot;
	int slot_orig;
3005 3006
	struct extent_buffer *leaf;
	struct btrfs_item *item;
3007 3008 3009 3010 3011 3012 3013
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;

	slot_orig = path->slots[0];
3014
	leaf = path->nodes[0];
3015

3016
	nritems = btrfs_header_nritems(leaf);
3017 3018
	data_end = leaf_data_end(root, leaf);

3019 3020
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
3021
		BUG();
3022
	}
3023
	slot = path->slots[0];
3024
	old_data = btrfs_item_end_nr(leaf, slot);
3025 3026

	BUG_ON(slot < 0);
3027 3028
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3029 3030
		printk(KERN_CRIT "slot %d too large, nritems %d\n",
		       slot, nritems);
3031 3032
		BUG_ON(1);
	}
3033 3034 3035 3036 3037 3038

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
3039 3040
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
3041 3042 3043 3044 3045 3046 3047 3048

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}
3049 3050
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff - data_size);
3051
	}
3052

3053 3054 3055 3056 3057
	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

3058
	/* shift the data */
3059
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3060 3061
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);
3062

3063
	data_end = old_data;
3064 3065 3066 3067
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);
3068 3069

	ret = 0;
3070 3071
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3072
		BUG();
3073
	}
3074 3075 3076
	return ret;
}

3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099
/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 * Returns the number of keys that were inserted.
 */
int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	int ret = 0;
	int slot;
	int i;
	u32 nritems;
	u32 total_data = 0;
	u32 total_size = 0;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct btrfs_key found_key;

3100 3101 3102 3103 3104 3105
	for (i = 0; i < nr; i++) {
		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
		    BTRFS_LEAF_DATA_SIZE(root)) {
			break;
			nr = i;
		}
3106
		total_data += data_size[i];
3107 3108 3109
		total_size += data_size[i] + sizeof(struct btrfs_item);
	}
	BUG_ON(nr == 0);
3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151

	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	if (btrfs_leaf_free_space(root, leaf) < total_size) {
		for (i = nr; i >= 0; i--) {
			total_data -= data_size[i];
			total_size -= data_size[i] + sizeof(struct btrfs_item);
			if (total_size < btrfs_leaf_free_space(root, leaf))
				break;
		}
		nr = i;
	}

	slot = path->slots[0];
	BUG_ON(slot < 0);

	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		item = btrfs_item_nr(leaf, slot);
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* figure out how many keys we can insert in here */
		total_data = data_size[0];
		for (i = 1; i < nr; i++) {
			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
				break;
			total_data += data_size[i];
		}
		nr = i;

		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3152
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228
			       slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		WARN_ON(leaf->map_token);
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff - total_data);
		}
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end - total_data, btrfs_leaf_data(leaf) +
			      data_end, old_data - data_end);
		data_end = old_data;
	} else {
		/*
		 * this sucks but it has to be done, if we are inserting at
		 * the end of the leaf only insert 1 of the items, since we
		 * have no way of knowing whats on the next leaf and we'd have
		 * to drop our current locks to figure it out
		 */
		nr = 1;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
	}

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
out:
	if (!ret)
		ret = nr;
	return ret;
}

C
Chris Mason 已提交
3229
/*
C
Chris Mason 已提交
3230
 * Given a key and some data, insert items into the tree.
C
Chris Mason 已提交
3231 3232
 * This does all the path init required, making room in the tree if needed.
 */
3233
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3234 3235
			    struct btrfs_root *root,
			    struct btrfs_path *path,
3236 3237
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
3238
{
3239 3240
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
3241
	int ret = 0;
3242
	int slot;
3243
	int slot_orig;
3244
	int i;
3245
	u32 nritems;
3246 3247
	u32 total_size = 0;
	u32 total_data = 0;
3248
	unsigned int data_end;
C
Chris Mason 已提交
3249 3250
	struct btrfs_disk_key disk_key;

C
Chris Mason 已提交
3251
	for (i = 0; i < nr; i++)
3252
		total_data += data_size[i];
3253

3254
	total_size = total_data + (nr * sizeof(struct btrfs_item));
3255
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
J
Josef Bacik 已提交
3256
	if (ret == 0)
3257
		return -EEXIST;
3258 3259
	if (ret < 0)
		goto out;
3260

3261
	slot_orig = path->slots[0];
3262
	leaf = path->nodes[0];
C
Chris Mason 已提交
3263

3264
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
3265
	data_end = leaf_data_end(root, leaf);
3266

3267
	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3268
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3269
		printk(KERN_CRIT "not enough freespace need %u have %d\n",
3270
		       total_size, btrfs_leaf_free_space(root, leaf));
3271
		BUG();
3272
	}
3273

3274
	slot = path->slots[0];
3275
	BUG_ON(slot < 0);
3276

3277
	if (slot != nritems) {
3278
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3279

3280 3281
		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3282
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3283 3284 3285
			       slot, old_data, data_end);
			BUG_ON(1);
		}
3286 3287 3288 3289
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
3290
		WARN_ON(leaf->map_token);
C
Chris Mason 已提交
3291
		for (i = slot; i < nritems; i++) {
3292
			u32 ioff;
3293

3294
			item = btrfs_item_nr(leaf, i);
3295 3296 3297 3298 3299 3300 3301 3302
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

3303
			ioff = btrfs_item_offset(leaf, item);
3304
			btrfs_set_item_offset(leaf, item, ioff - total_data);
C
Chris Mason 已提交
3305
		}
3306 3307 3308 3309
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}
3310 3311

		/* shift the items */
3312
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3313
			      btrfs_item_nr_offset(slot),
C
Chris Mason 已提交
3314
			      (nritems - slot) * sizeof(struct btrfs_item));
3315 3316

		/* shift the data */
3317
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3318
			      data_end - total_data, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3319
			      data_end, old_data - data_end);
3320 3321
		data_end = old_data;
	}
3322

3323
	/* setup the item for the new data */
3324 3325 3326 3327 3328 3329 3330 3331 3332
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
3333
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
3334 3335

	ret = 0;
3336 3337
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3338
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3339
	}
C
Chris Mason 已提交
3340

3341 3342
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3343
		BUG();
3344
	}
3345
out:
3346 3347 3348 3349 3350 3351 3352
	return ret;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
3353 3354 3355
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *cpu_key, void *data, u32
		      data_size)
3356 3357
{
	int ret = 0;
C
Chris Mason 已提交
3358
	struct btrfs_path *path;
3359 3360
	struct extent_buffer *leaf;
	unsigned long ptr;
3361

C
Chris Mason 已提交
3362 3363 3364
	path = btrfs_alloc_path();
	BUG_ON(!path);
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3365
	if (!ret) {
3366 3367 3368 3369
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
3370
	}
C
Chris Mason 已提交
3371
	btrfs_free_path(path);
C
Chris Mason 已提交
3372
	return ret;
3373 3374
}

C
Chris Mason 已提交
3375
/*
C
Chris Mason 已提交
3376
 * delete the pointer from a given node.
C
Chris Mason 已提交
3377
 *
C
Chris Mason 已提交
3378 3379
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
C
Chris Mason 已提交
3380
 */
3381 3382
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot)
3383
{
3384
	struct extent_buffer *parent = path->nodes[level];
3385
	u32 nritems;
C
Chris Mason 已提交
3386
	int ret = 0;
3387
	int wret;
3388

3389
	nritems = btrfs_header_nritems(parent);
C
Chris Mason 已提交
3390
	if (slot != nritems - 1) {
3391 3392 3393
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
C
Chris Mason 已提交
3394 3395
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
3396
	}
3397
	nritems--;
3398
	btrfs_set_header_nritems(parent, nritems);
3399
	if (nritems == 0 && parent == root->node) {
3400
		BUG_ON(btrfs_header_level(root->node) != 1);
3401
		/* just turn the root into a leaf and break */
3402
		btrfs_set_header_level(root->node, 0);
3403
	} else if (slot == 0) {
3404 3405 3406 3407
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
C
Chris Mason 已提交
3408 3409
		if (wret)
			ret = wret;
3410
	}
C
Chris Mason 已提交
3411
	btrfs_mark_buffer_dirty(parent);
C
Chris Mason 已提交
3412
	return ret;
3413 3414
}

3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].  bytenr is the node block pointer, but since the callers
 * already know it, it is faster to have them pass it down than to
 * read it out of the node again.
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path, u64 bytenr)
{
	int ret;
	u64 root_gen = btrfs_header_generation(path->nodes[1]);

	ret = del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret)
		return ret;

	ret = btrfs_free_extent(trans, root, bytenr,
				btrfs_level_size(root, 0),
				path->nodes[1]->start,
				btrfs_header_owner(path->nodes[1]),
3442
				root_gen, 0, 1);
3443 3444
	return ret;
}
C
Chris Mason 已提交
3445 3446 3447 3448
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
3449 3450
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
3451
{
3452 3453
	struct extent_buffer *leaf;
	struct btrfs_item *item;
3454 3455
	int last_off;
	int dsize = 0;
C
Chris Mason 已提交
3456 3457
	int ret = 0;
	int wret;
3458
	int i;
3459
	u32 nritems;
3460

3461
	leaf = path->nodes[0];
3462 3463 3464 3465 3466
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

3467
	nritems = btrfs_header_nritems(leaf);
3468

3469
	if (slot + nr != nritems) {
C
Chris Mason 已提交
3470
		int data_end = leaf_data_end(root, leaf);
3471 3472

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3473 3474
			      data_end + dsize,
			      btrfs_leaf_data(leaf) + data_end,
3475
			      last_off - data_end);
3476

3477
		for (i = slot + nr; i < nritems; i++) {
3478
			u32 ioff;
3479

3480
			item = btrfs_item_nr(leaf, i);
3481 3482 3483 3484 3485 3486 3487
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}
3488 3489
			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff + dsize);
C
Chris Mason 已提交
3490
		}
3491 3492 3493 3494 3495 3496

		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

3497
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3498
			      btrfs_item_nr_offset(slot + nr),
C
Chris Mason 已提交
3499
			      sizeof(struct btrfs_item) *
3500
			      (nritems - slot - nr));
3501
	}
3502 3503
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;
3504

C
Chris Mason 已提交
3505
	/* delete the leaf if we've emptied it */
3506
	if (nritems == 0) {
3507 3508
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
3509
		} else {
3510 3511
			ret = btrfs_del_leaf(trans, root, path, leaf->start);
			BUG_ON(ret);
3512
		}
3513
	} else {
3514
		int used = leaf_space_used(leaf, 0, nritems);
C
Chris Mason 已提交
3515
		if (slot == 0) {
3516 3517 3518
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
3519
			wret = fixup_low_keys(trans, root, path,
3520
					      &disk_key, 1);
C
Chris Mason 已提交
3521 3522 3523 3524
			if (wret)
				ret = wret;
		}

C
Chris Mason 已提交
3525
		/* delete the leaf if it is mostly empty */
3526
		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3527 3528 3529 3530
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
3531
			slot = path->slots[1];
3532 3533
			extent_buffer_get(leaf);

3534
			wret = push_leaf_left(trans, root, path, 1, 1);
3535
			if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3536
				ret = wret;
3537 3538 3539

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
3540
				wret = push_leaf_right(trans, root, path, 1, 1);
3541
				if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3542 3543
					ret = wret;
			}
3544 3545

			if (btrfs_header_nritems(leaf) == 0) {
3546
				path->slots[1] = slot;
C
Chris Mason 已提交
3547 3548
				ret = btrfs_del_leaf(trans, root, path,
						     leaf->start);
3549
				BUG_ON(ret);
3550
				free_extent_buffer(leaf);
C
Chris Mason 已提交
3551
			} else {
3552 3553 3554 3555 3556 3557 3558
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
3559
				free_extent_buffer(leaf);
3560
			}
3561
		} else {
3562
			btrfs_mark_buffer_dirty(leaf);
3563 3564
		}
	}
C
Chris Mason 已提交
3565
	return ret;
3566 3567
}

3568
/*
3569
 * search the tree again to find a leaf with lesser keys
3570 3571
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3572 3573 3574
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
3575 3576 3577
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
3578 3579 3580
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;
3581

3582
	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3583

3584 3585 3586 3587 3588 3589 3590 3591
	if (key.offset > 0)
		key.offset--;
	else if (key.type > 0)
		key.type--;
	else if (key.objectid > 0)
		key.objectid--;
	else
		return 1;
3592

3593 3594 3595 3596 3597 3598 3599 3600 3601
	btrfs_release_path(root, path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	if (ret < 0)
		return 0;
	return 1;
3602 3603
}

3604 3605 3606
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are either in cache or have a minimum
C
Chris Mason 已提交
3607
 * transaction id.  This is used by the btree defrag code, and tree logging
3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
C
Chris Mason 已提交
3619 3620 3621 3622
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
3623 3624 3625 3626
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3627
			 struct btrfs_key *max_key,
3628 3629 3630 3631 3632 3633
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
3634
	int sret;
3635 3636 3637 3638
	u32 nritems;
	int level;
	int ret = 1;

3639
	WARN_ON(!path->keep_locks);
3640 3641 3642
again:
	cur = btrfs_lock_root_node(root);
	level = btrfs_header_level(cur);
3643
	WARN_ON(path->nodes[level]);
3644 3645 3646 3647 3648 3649 3650
	path->nodes[level] = cur;
	path->locks[level] = 1;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
C
Chris Mason 已提交
3651
	while (1) {
3652 3653
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
3654
		sret = bin_search(cur, min_key, level, &slot);
3655

3656 3657
		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
3658 3659
			if (slot >= nritems)
				goto find_next_key;
3660 3661 3662 3663 3664
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
3665 3666
		if (sret && slot > 0)
			slot--;
3667 3668 3669 3670 3671
		/*
		 * check this node pointer against the cache_only and
		 * min_trans parameters.  If it isn't in cache or is too
		 * old, skip to the next one.
		 */
C
Chris Mason 已提交
3672
		while (slot < nritems) {
3673 3674 3675
			u64 blockptr;
			u64 gen;
			struct extent_buffer *tmp;
3676 3677
			struct btrfs_disk_key disk_key;

3678 3679 3680 3681 3682 3683 3684 3685 3686
			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			if (!cache_only)
				break;

3687 3688 3689 3690 3691 3692 3693 3694
			if (max_key) {
				btrfs_node_key(cur, &disk_key, slot);
				if (comp_keys(&disk_key, max_key) >= 0) {
					ret = 1;
					goto out;
				}
			}

3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705
			tmp = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));

			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				free_extent_buffer(tmp);
				break;
			}
			if (tmp)
				free_extent_buffer(tmp);
			slot++;
		}
3706
find_next_key:
3707 3708 3709 3710 3711
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
3712 3713
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
3714
						  cache_only, min_trans);
3715
			if (sret == 0) {
3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754
				btrfs_release_path(root, path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1);
			goto out;
		}
		cur = read_node_slot(root, cur, slot);

		btrfs_tree_lock(cur);
		path->locks[level - 1] = 1;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1);
	}
out:
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the cache_only and min_trans
 * parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
3755
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3756 3757
			struct btrfs_key *key, int lowest_level,
			int cache_only, u64 min_trans)
3758 3759 3760 3761 3762
{
	int level = lowest_level;
	int slot;
	struct extent_buffer *c;

3763
	WARN_ON(!path->keep_locks);
C
Chris Mason 已提交
3764
	while (level < BTRFS_MAX_LEVEL) {
3765 3766 3767 3768 3769
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
3770
next:
3771 3772
		if (slot >= btrfs_header_nritems(c)) {
			level++;
C
Chris Mason 已提交
3773
			if (level == BTRFS_MAX_LEVEL)
3774 3775 3776 3777 3778
				return 1;
			continue;
		}
		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798
		else {
			u64 blockptr = btrfs_node_blockptr(c, slot);
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (cache_only) {
				struct extent_buffer *cur;
				cur = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));
				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
					slot++;
					if (cur)
						free_extent_buffer(cur);
					goto next;
				}
				free_extent_buffer(cur);
			}
			if (gen < min_trans) {
				slot++;
				goto next;
			}
3799
			btrfs_node_key_to_cpu(c, key, slot);
3800
		}
3801 3802 3803 3804 3805
		return 0;
	}
	return 1;
}

C
Chris Mason 已提交
3806
/*
3807
 * search the tree again to find a leaf with greater keys
C
Chris Mason 已提交
3808 3809
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3810
 */
C
Chris Mason 已提交
3811
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3812 3813 3814
{
	int slot;
	int level = 1;
3815 3816
	struct extent_buffer *c;
	struct extent_buffer *next = NULL;
3817 3818 3819 3820 3821
	struct btrfs_key key;
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
C
Chris Mason 已提交
3822
	if (nritems == 0)
3823 3824 3825 3826 3827
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);

	btrfs_release_path(root, path);
3828
	path->keep_locks = 1;
3829 3830 3831 3832 3833 3834
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

3835
	nritems = btrfs_header_nritems(path->nodes[0]);
3836 3837 3838 3839 3840 3841
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
3842
	if (nritems > 0 && path->slots[0] < nritems - 1) {
3843
		path->slots[0]++;
3844 3845
		goto done;
	}
3846

C
Chris Mason 已提交
3847
	while (level < BTRFS_MAX_LEVEL) {
3848
		if (!path->nodes[level])
C
Chris Mason 已提交
3849
			return 1;
3850

3851 3852
		slot = path->slots[level] + 1;
		c = path->nodes[level];
3853
		if (slot >= btrfs_header_nritems(c)) {
3854
			level++;
C
Chris Mason 已提交
3855
			if (level == BTRFS_MAX_LEVEL)
3856
				return 1;
3857 3858
			continue;
		}
3859

3860 3861
		if (next) {
			btrfs_tree_unlock(next);
3862
			free_extent_buffer(next);
3863
		}
3864

3865 3866
		if (level == 1 && (path->locks[1] || path->skip_locking) &&
		    path->reada)
3867
			reada_for_search(root, path, level, slot, 0);
3868

3869
		next = read_node_slot(root, c, slot);
3870 3871 3872 3873
		if (!path->skip_locking) {
			WARN_ON(!btrfs_tree_locked(c));
			btrfs_tree_lock(next);
		}
3874 3875 3876
		break;
	}
	path->slots[level] = slot;
C
Chris Mason 已提交
3877
	while (1) {
3878 3879
		level--;
		c = path->nodes[level];
3880 3881
		if (path->locks[level])
			btrfs_tree_unlock(c);
3882
		free_extent_buffer(c);
3883 3884
		path->nodes[level] = next;
		path->slots[level] = 0;
3885 3886
		if (!path->skip_locking)
			path->locks[level] = 1;
3887 3888
		if (!level)
			break;
3889 3890
		if (level == 1 && path->locks[1] && path->reada)
			reada_for_search(root, path, level, slot, 0);
3891
		next = read_node_slot(root, next, 0);
3892 3893 3894 3895
		if (!path->skip_locking) {
			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
			btrfs_tree_lock(next);
		}
3896
	}
3897 3898
done:
	unlock_up(path, 0, 1);
3899 3900
	return 0;
}
3901

3902 3903 3904 3905 3906 3907
/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
3908 3909 3910 3911 3912 3913
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
3914
	u32 nritems;
3915 3916
	int ret;

C
Chris Mason 已提交
3917
	while (1) {
3918 3919 3920 3921 3922 3923 3924 3925
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
3926 3927 3928 3929 3930 3931
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

3932 3933 3934
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.type == type)
			return 0;
3935 3936 3937 3938 3939
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
3940 3941 3942
	}
	return 1;
}