ctree.c 101.6 KB
Newer Older
C
Chris Mason 已提交
1
/*
C
Chris Mason 已提交
2
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
C
Chris Mason 已提交
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

19
#include <linux/sched.h>
20 21
#include "ctree.h"
#include "disk-io.h"
22
#include "transaction.h"
23
#include "print-tree.h"
24
#include "locking.h"
25

26 27 28
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29
		      *root, struct btrfs_key *ins_key,
30
		      struct btrfs_path *path, int data_size, int extend);
31 32
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
33
			  struct extent_buffer *src, int empty);
34 35 36 37
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
38 39
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot);
40

C
Chris Mason 已提交
41
inline void btrfs_init_path(struct btrfs_path *p)
C
Chris Mason 已提交
42
{
C
Chris Mason 已提交
43
	memset(p, 0, sizeof(*p));
C
Chris Mason 已提交
44 45
}

C
Chris Mason 已提交
46
struct btrfs_path *btrfs_alloc_path(void)
C
Chris Mason 已提交
47
{
C
Chris Mason 已提交
48 49
	struct btrfs_path *path;
	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50
	if (path) {
C
Chris Mason 已提交
51
		btrfs_init_path(path);
52 53
		path->reada = 1;
	}
C
Chris Mason 已提交
54
	return path;
C
Chris Mason 已提交
55 56
}

C
Chris Mason 已提交
57
/* this also releases the path */
C
Chris Mason 已提交
58
void btrfs_free_path(struct btrfs_path *p)
59
{
C
Chris Mason 已提交
60 61
	btrfs_release_path(NULL, p);
	kmem_cache_free(btrfs_path_cachep, p);
62 63
}

C
Chris Mason 已提交
64 65 66 67 68 69
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that no locks or extent buffers held.
 */
C
Chris Mason 已提交
70
noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71 72
{
	int i;
73

C
Chris Mason 已提交
74
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75
		p->slots[i] = 0;
76
		if (!p->nodes[i])
77 78 79 80 81
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock(p->nodes[i]);
			p->locks[i] = 0;
		}
82
		free_extent_buffer(p->nodes[i]);
83
		p->nodes[i] = NULL;
84 85 86
	}
}

C
Chris Mason 已提交
87 88 89 90 91 92 93 94 95 96
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
97 98 99 100 101 102 103 104 105 106
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;
	spin_lock(&root->node_lock);
	eb = root->node;
	extent_buffer_get(eb);
	spin_unlock(&root->node_lock);
	return eb;
}

C
Chris Mason 已提交
107 108 109 110
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
111 112 113 114
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

C
Chris Mason 已提交
115
	while (1) {
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);

		spin_lock(&root->node_lock);
		if (eb == root->node) {
			spin_unlock(&root->node_lock);
			break;
		}
		spin_unlock(&root->node_lock);

		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

C
Chris Mason 已提交
132 133 134 135
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
136 137 138 139 140 141 142 143
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
}

C
Chris Mason 已提交
144 145 146 147 148
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
149 150 151 152 153 154 155 156 157
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	u32 nritems;
	int ret = 0;
	int level;
158
	struct btrfs_root *new_root;
159

160 161 162 163 164 165
	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
	if (!new_root)
		return -ENOMEM;

	memcpy(new_root, root, sizeof(*new_root));
	new_root->root_key.objectid = new_root_objectid;
166 167 168 169 170 171 172

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
173 174 175 176

	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
				     new_root_objectid, trans->transid,
				     level, buf->start, 0);
177 178
	if (IS_ERR(cow)) {
		kfree(new_root);
179
		return PTR_ERR(cow);
180
	}
181 182 183 184 185

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, new_root_objectid);
186
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187

Y
Yan Zheng 已提交
188 189 190 191
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

192
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
Z
Zheng Yan 已提交
193
	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194 195
	kfree(new_root);

196 197 198 199 200 201 202 203
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

C
Chris Mason 已提交
204
/*
C
Chris Mason 已提交
205 206 207 208
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
C
Chris Mason 已提交
209 210 211
 *
 * search_start -- an allocation hint for the new block
 *
C
Chris Mason 已提交
212 213 214
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
C
Chris Mason 已提交
215 216
 *
 * prealloc_dest -- if you have already reserved a destination for the cow,
C
Chris Mason 已提交
217 218
 * this uses that block instead of allocating a new one.
 * btrfs_alloc_reserved_extent is used to finish the allocation.
C
Chris Mason 已提交
219
 */
C
Chris Mason 已提交
220
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
221 222 223 224
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
225 226
			     u64 search_start, u64 empty_size,
			     u64 prealloc_dest)
C
Chris Mason 已提交
227
{
Z
Zheng Yan 已提交
228
	u64 parent_start;
229
	struct extent_buffer *cow;
230
	u32 nritems;
231
	int ret = 0;
232
	int level;
233
	int unlock_orig = 0;
234

235 236 237 238 239
	if (*cow_ret == buf)
		unlock_orig = 1;

	WARN_ON(!btrfs_tree_locked(buf));

Z
Zheng Yan 已提交
240 241 242 243 244
	if (parent)
		parent_start = parent->start;
	else
		parent_start = 0;

245 246
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
247
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248

249 250
	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
251

252 253 254 255 256 257 258
	if (prealloc_dest) {
		struct btrfs_key ins;

		ins.objectid = prealloc_dest;
		ins.offset = buf->len;
		ins.type = BTRFS_EXTENT_ITEM_KEY;

Z
Zheng Yan 已提交
259
		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260
						  root->root_key.objectid,
261
						  trans->transid, level, &ins);
262 263 264 265 266
		BUG_ON(ret);
		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
					    buf->len);
	} else {
		cow = btrfs_alloc_free_block(trans, root, buf->len,
Z
Zheng Yan 已提交
267
					     parent_start,
268
					     root->root_key.objectid,
Z
Zheng Yan 已提交
269 270
					     trans->transid, level,
					     search_start, empty_size);
271
	}
272 273
	if (IS_ERR(cow))
		return PTR_ERR(cow);
274

275
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
276
	btrfs_set_header_bytenr(cow, cow->start);
277 278
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, root->root_key.objectid);
279
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280

Y
Yan Zheng 已提交
281 282 283 284
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

285 286
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (btrfs_header_generation(buf) != trans->transid) {
Z
Zheng Yan 已提交
287 288
		u32 nr_extents;
		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289 290
		if (ret)
			return ret;
Z
Zheng Yan 已提交
291 292 293

		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
		WARN_ON(ret);
Z
Zheng Yan 已提交
294 295 296 297
	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
		/*
		 * There are only two places that can drop reference to
		 * tree blocks owned by living reloc trees, one is here,
Y
Yan Zheng 已提交
298
		 * the other place is btrfs_drop_subtree. In both places,
Z
Zheng Yan 已提交
299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
		 * we check reference count while tree block is locked.
		 * Furthermore, if reference count is one, it won't get
		 * increased by someone else.
		 */
		u32 refs;
		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
					      buf->len, &refs);
		BUG_ON(ret);
		if (refs == 1) {
			ret = btrfs_update_ref(trans, root, buf, cow,
					       0, nritems);
			clean_tree_block(trans, root, buf);
		} else {
			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
		}
		BUG_ON(ret);
315
	} else {
Z
Zheng Yan 已提交
316 317 318
		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
		if (ret)
			return ret;
319 320 321
		clean_tree_block(trans, root, buf);
	}

Z
Zheng Yan 已提交
322 323 324 325 326
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
		WARN_ON(ret);
	}

C
Chris Mason 已提交
327
	if (buf == root->node) {
328 329 330
		WARN_ON(parent && parent != buf);

		spin_lock(&root->node_lock);
C
Chris Mason 已提交
331
		root->node = cow;
332
		extent_buffer_get(cow);
333 334
		spin_unlock(&root->node_lock);

C
Chris Mason 已提交
335
		if (buf != root->commit_root) {
336
			btrfs_free_extent(trans, root, buf->start,
Z
Zheng Yan 已提交
337 338 339
					  buf->len, buf->start,
					  root->root_key.objectid,
					  btrfs_header_generation(buf),
340
					  level, 1);
C
Chris Mason 已提交
341
		}
342
		free_extent_buffer(buf);
343
		add_root_to_dirty_list(root);
C
Chris Mason 已提交
344
	} else {
345
		btrfs_set_node_blockptr(parent, parent_slot,
346
					cow->start);
347 348 349
		WARN_ON(trans->transid == 0);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
C
Chris Mason 已提交
350
		btrfs_mark_buffer_dirty(parent);
351
		WARN_ON(btrfs_header_generation(parent) != trans->transid);
352
		btrfs_free_extent(trans, root, buf->start, buf->len,
Z
Zheng Yan 已提交
353
				  parent_start, btrfs_header_owner(parent),
354
				  btrfs_header_generation(parent), level, 1);
C
Chris Mason 已提交
355
	}
356 357
	if (unlock_orig)
		btrfs_tree_unlock(buf);
358
	free_extent_buffer(buf);
C
Chris Mason 已提交
359
	btrfs_mark_buffer_dirty(cow);
C
Chris Mason 已提交
360
	*cow_ret = cow;
C
Chris Mason 已提交
361 362 363
	return 0;
}

C
Chris Mason 已提交
364 365 366 367 368
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
C
Chris Mason 已提交
369
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
370 371
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
372
		    struct extent_buffer **cow_ret, u64 prealloc_dest)
373 374
{
	u64 search_start;
375
	int ret;
C
Chris Mason 已提交
376

377
	if (trans->transaction != root->fs_info->running_transaction) {
C
Chris Mason 已提交
378 379 380
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)
381 382 383 384
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
C
Chris Mason 已提交
385 386 387
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)root->fs_info->generation);
388 389
		WARN_ON(1);
	}
C
Chris Mason 已提交
390

391
	spin_lock(&root->fs_info->hash_lock);
392 393
	if (btrfs_header_generation(buf) == trans->transid &&
	    btrfs_header_owner(buf) == root->root_key.objectid &&
394
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395
		*cow_ret = buf;
396
		spin_unlock(&root->fs_info->hash_lock);
397
		WARN_ON(prealloc_dest);
398 399
		return 0;
	}
400
	spin_unlock(&root->fs_info->hash_lock);
401
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
402
	ret = __btrfs_cow_block(trans, root, buf, parent,
403 404
				 parent_slot, cow_ret, search_start, 0,
				 prealloc_dest);
405
	return ret;
406 407
}

C
Chris Mason 已提交
408 409 410 411
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
412
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
413
{
414
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
415
		return 1;
416
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
417 418 419 420
		return 1;
	return 0;
}

421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444
/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	if (k1.objectid > k2->objectid)
		return 1;
	if (k1.objectid < k2->objectid)
		return -1;
	if (k1.type > k2->type)
		return 1;
	if (k1.type < k2->type)
		return -1;
	if (k1.offset > k2->offset)
		return 1;
	if (k1.offset < k2->offset)
		return -1;
	return 0;
}

445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463
/*
 * same as comp_keys only with two btrfs_key's
 */
static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
464

C
Chris Mason 已提交
465 466 467 468 469
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
470
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
471
		       struct btrfs_root *root, struct extent_buffer *parent,
472 473
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
474
{
475
	struct extent_buffer *cur;
476
	u64 blocknr;
477
	u64 gen;
478 479
	u64 search_start = *last_ret;
	u64 last_block = 0;
480 481 482 483 484
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
485
	int parent_level;
486 487
	int uptodate;
	u32 blocksize;
488 489
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;
490

491 492 493 494
	parent_level = btrfs_header_level(parent);
	if (cache_only && parent_level != 1)
		return 0;

C
Chris Mason 已提交
495
	if (trans->transaction != root->fs_info->running_transaction)
496
		WARN_ON(1);
C
Chris Mason 已提交
497
	if (trans->transid != root->fs_info->generation)
498
		WARN_ON(1);
499

500 501
	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
502 503 504 505 506 507 508
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;
509

510 511 512 513 514 515 516 517
		if (!parent->map_token) {
			map_extent_buffer(parent,
					btrfs_node_key_ptr_offset(i),
					sizeof(struct btrfs_key_ptr),
					&parent->map_token, &parent->kaddr,
					&parent->map_start, &parent->map_len,
					KM_USER1);
		}
518 519 520 521 522
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
523
		blocknr = btrfs_node_blockptr(parent, i);
524
		gen = btrfs_node_ptr_generation(parent, i);
525 526
		if (last_block == 0)
			last_block = blocknr;
527

528
		if (i > 0) {
529 530
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
531
		}
C
Chris Mason 已提交
532
		if (!close && i < end_slot - 2) {
533 534
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
535
		}
536 537
		if (close) {
			last_block = blocknr;
538
			continue;
539
		}
540 541 542 543 544
		if (parent->map_token) {
			unmap_extent_buffer(parent, parent->map_token,
					    KM_USER1);
			parent->map_token = NULL;
		}
545

546 547
		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
548
			uptodate = btrfs_buffer_uptodate(cur, gen);
549 550
		else
			uptodate = 0;
551
		if (!cur || !uptodate) {
552
			if (cache_only) {
553
				free_extent_buffer(cur);
554 555
				continue;
			}
556 557
			if (!cur) {
				cur = read_tree_block(root, blocknr,
558
							 blocksize, gen);
559
			} else if (!uptodate) {
560
				btrfs_read_buffer(cur, gen);
561
			}
562
		}
563
		if (search_start == 0)
564
			search_start = last_block;
565

566
		btrfs_tree_lock(cur);
567
		err = __btrfs_cow_block(trans, root, cur, parent, i,
568
					&cur, search_start,
569
					min(16 * blocksize,
570
					    (end_slot - i) * blocksize), 0);
Y
Yan 已提交
571
		if (err) {
572
			btrfs_tree_unlock(cur);
573
			free_extent_buffer(cur);
574
			break;
Y
Yan 已提交
575
		}
576 577
		search_start = cur->start;
		last_block = cur->start;
578
		*last_ret = search_start;
579 580
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
581
	}
582 583 584 585 586
	if (parent->map_token) {
		unmap_extent_buffer(parent, parent->map_token,
				    KM_USER1);
		parent->map_token = NULL;
	}
587 588 589
	return err;
}

C
Chris Mason 已提交
590 591 592 593 594
/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
C
Chris Mason 已提交
595
static inline unsigned int leaf_data_end(struct btrfs_root *root,
596
					 struct extent_buffer *leaf)
597
{
598
	u32 nr = btrfs_header_nritems(leaf);
599
	if (nr == 0)
C
Chris Mason 已提交
600
		return BTRFS_LEAF_DATA_SIZE(root);
601
	return btrfs_item_offset_nr(leaf, nr - 1);
602 603
}

C
Chris Mason 已提交
604 605 606 607
/*
 * extra debugging checks to make sure all the items in a key are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
608 609
static int check_node(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
610
{
611 612 613 614
	struct extent_buffer *parent = NULL;
	struct extent_buffer *node = path->nodes[level];
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key node_key;
C
Chris Mason 已提交
615
	int parent_slot;
616 617
	int slot;
	struct btrfs_key cpukey;
618
	u32 nritems = btrfs_header_nritems(node);
C
Chris Mason 已提交
619 620

	if (path->nodes[level + 1])
621
		parent = path->nodes[level + 1];
A
Aneesh 已提交
622

623
	slot = path->slots[level];
624 625
	BUG_ON(nritems == 0);
	if (parent) {
A
Aneesh 已提交
626
		parent_slot = path->slots[level + 1];
627 628 629
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_node_key(node, &node_key, 0);
		BUG_ON(memcmp(&parent_key, &node_key,
C
Chris Mason 已提交
630
			      sizeof(struct btrfs_disk_key)));
631
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
632
		       btrfs_header_bytenr(node));
C
Chris Mason 已提交
633
	}
C
Chris Mason 已提交
634
	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
635
	if (slot != 0) {
636 637 638
		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
639 640
	}
	if (slot < nritems - 1) {
641 642 643
		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
C
Chris Mason 已提交
644 645 646 647
	}
	return 0;
}

C
Chris Mason 已提交
648 649 650 651
/*
 * extra checking to make sure all the items in a leaf are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
652 653
static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
654
{
655 656
	struct extent_buffer *leaf = path->nodes[level];
	struct extent_buffer *parent = NULL;
C
Chris Mason 已提交
657
	int parent_slot;
658
	struct btrfs_key cpukey;
659 660 661
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key leaf_key;
	int slot = path->slots[0];
662

663
	u32 nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
664 665

	if (path->nodes[level + 1])
666
		parent = path->nodes[level + 1];
667 668 669 670 671

	if (nritems == 0)
		return 0;

	if (parent) {
A
Aneesh 已提交
672
		parent_slot = path->slots[level + 1];
673 674
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_item_key(leaf, &leaf_key, 0);
675

676
		BUG_ON(memcmp(&parent_key, &leaf_key,
C
Chris Mason 已提交
677
		       sizeof(struct btrfs_disk_key)));
678
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
679
		       btrfs_header_bytenr(leaf));
680 681 682 683 684 685
	}
	if (slot != 0 && slot < nritems - 1) {
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
		if (comp_keys(&leaf_key, &cpukey) <= 0) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
686
			printk(KERN_CRIT "slot %d offset bad key\n", slot);
687 688 689 690 691
			BUG_ON(1);
		}
		if (btrfs_item_offset_nr(leaf, slot - 1) !=
		       btrfs_item_end_nr(leaf, slot)) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
692
			printk(KERN_CRIT "slot %d offset bad\n", slot);
693 694
			BUG_ON(1);
		}
695 696
	}
	if (slot < nritems - 1) {
697 698 699 700 701 702
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
		if (btrfs_item_offset_nr(leaf, slot) !=
			btrfs_item_end_nr(leaf, slot + 1)) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
703
			printk(KERN_CRIT "slot %d offset bad\n", slot);
704 705
			BUG_ON(1);
		}
C
Chris Mason 已提交
706
	}
707 708
	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
C
Chris Mason 已提交
709 710 711
	return 0;
}

C
Chris Mason 已提交
712
static noinline int check_block(struct btrfs_root *root,
713
				struct btrfs_path *path, int level)
C
Chris Mason 已提交
714
{
715
	return 0;
C
Chris Mason 已提交
716
	if (level == 0)
C
Chris Mason 已提交
717 718
		return check_leaf(root, path, level);
	return check_node(root, path, level);
C
Chris Mason 已提交
719 720
}

C
Chris Mason 已提交
721
/*
722 723 724
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
C
Chris Mason 已提交
725 726 727 728 729 730
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
731 732 733 734
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
735 736 737 738 739
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
740
	struct btrfs_disk_key *tmp = NULL;
741 742 743 744 745 746
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
747
	int err;
748

C
Chris Mason 已提交
749
	while (low < high) {
750
		mid = (low + high) / 2;
751 752 753 754 755
		offset = p + mid * item_size;

		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
756
			if (map_token) {
757
				unmap_extent_buffer(eb, map_token, KM_USER0);
758 759
				map_token = NULL;
			}
760 761

			err = map_private_extent_buffer(eb, offset,
762 763 764 765 766 767 768 769 770 771 772 773
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}
774 775 776 777 778

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
779 780 781 782 783 784 785 786
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
787 788
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
789 790 791 792
			return 0;
		}
	}
	*slot = low;
793 794
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
795 796 797
	return 1;
}

C
Chris Mason 已提交
798 799 800 801
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
802 803
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
804
{
805 806 807
	if (level == 0) {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
C
Chris Mason 已提交
808
					  sizeof(struct btrfs_item),
809
					  key, btrfs_header_nritems(eb),
810
					  slot);
811
	} else {
812 813
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
C
Chris Mason 已提交
814
					  sizeof(struct btrfs_key_ptr),
815
					  key, btrfs_header_nritems(eb),
816
					  slot);
817 818 819 820
	}
	return -1;
}

C
Chris Mason 已提交
821 822 823 824
/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
825
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
826
				   struct extent_buffer *parent, int slot)
827
{
828
	int level = btrfs_header_level(parent);
829 830
	if (slot < 0)
		return NULL;
831
	if (slot >= btrfs_header_nritems(parent))
832
		return NULL;
833 834 835

	BUG_ON(level == 0);

836
	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
837 838
		       btrfs_level_size(root, level - 1),
		       btrfs_node_ptr_generation(parent, slot));
839 840
}

C
Chris Mason 已提交
841 842 843 844 845
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 */
846
static noinline int balance_level(struct btrfs_trans_handle *trans,
847 848
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
849
{
850 851 852 853
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
854 855 856 857
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
858
	int err_on_enospc = 0;
859
	u64 orig_ptr;
860 861 862 863

	if (level == 0)
		return 0;

864
	mid = path->nodes[level];
865
	WARN_ON(!path->locks[level]);
866 867
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

868
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
869

C
Chris Mason 已提交
870
	if (level < BTRFS_MAX_LEVEL - 1)
871
		parent = path->nodes[level + 1];
872 873
	pslot = path->slots[level + 1];

C
Chris Mason 已提交
874 875 876 877
	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
878 879
	if (!parent) {
		struct extent_buffer *child;
880

881
		if (btrfs_header_nritems(mid) != 1)
882 883 884
			return 0;

		/* promote the child to a root */
885
		child = read_node_slot(root, mid, 0);
886
		btrfs_tree_lock(child);
887
		BUG_ON(!child);
888
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 890
		BUG_ON(ret);

891
		spin_lock(&root->node_lock);
892
		root->node = child;
893 894
		spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
895 896 897
		ret = btrfs_update_extent_ref(trans, root, child->start,
					      mid->start, child->start,
					      root->root_key.objectid,
898
					      trans->transid, level - 1);
Z
Zheng Yan 已提交
899 900
		BUG_ON(ret);

901
		add_root_to_dirty_list(root);
902 903
		btrfs_tree_unlock(child);
		path->locks[level] = 0;
904
		path->nodes[level] = NULL;
905
		clean_tree_block(trans, root, mid);
906
		btrfs_tree_unlock(mid);
907
		/* once for the path */
908
		free_extent_buffer(mid);
909
		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
Z
Zheng Yan 已提交
910
					mid->start, root->root_key.objectid,
911 912
					btrfs_header_generation(mid),
					level, 1);
913
		/* once for the root ptr */
914
		free_extent_buffer(mid);
915
		return ret;
916
	}
917
	if (btrfs_header_nritems(mid) >
C
Chris Mason 已提交
918
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
919 920
		return 0;

921
	if (btrfs_header_nritems(mid) < 2)
922 923
		err_on_enospc = 1;

924 925
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
926
		btrfs_tree_lock(left);
927
		wret = btrfs_cow_block(trans, root, left,
928
				       parent, pslot - 1, &left, 0);
929 930 931 932
		if (wret) {
			ret = wret;
			goto enospc;
		}
933
	}
934 935
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
936
		btrfs_tree_lock(right);
937
		wret = btrfs_cow_block(trans, root, right,
938
				       parent, pslot + 1, &right, 0);
939 940 941 942 943 944 945
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
946 947
	if (left) {
		orig_slot += btrfs_header_nritems(left);
948
		wret = push_node_left(trans, root, left, mid, 1);
949 950
		if (wret < 0)
			ret = wret;
951
		if (btrfs_header_nritems(mid) < 2)
952
			err_on_enospc = 1;
953
	}
954 955 956 957

	/*
	 * then try to empty the right most buffer into the middle
	 */
958
	if (right) {
959
		wret = push_node_left(trans, root, mid, right, 1);
960
		if (wret < 0 && wret != -ENOSPC)
961
			ret = wret;
962
		if (btrfs_header_nritems(right) == 0) {
963
			u64 bytenr = right->start;
964
			u64 generation = btrfs_header_generation(parent);
965 966
			u32 blocksize = right->len;

967
			clean_tree_block(trans, root, right);
968
			btrfs_tree_unlock(right);
969
			free_extent_buffer(right);
970
			right = NULL;
971 972
			wret = del_ptr(trans, root, path, level + 1, pslot +
				       1);
973 974
			if (wret)
				ret = wret;
975
			wret = btrfs_free_extent(trans, root, bytenr,
Z
Zheng Yan 已提交
976
						 blocksize, parent->start,
977
						 btrfs_header_owner(parent),
978
						 generation, level, 1);
979 980 981
			if (wret)
				ret = wret;
		} else {
982 983 984 985
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
986 987
		}
	}
988
	if (btrfs_header_nritems(mid) == 1) {
989 990 991 992 993 994 995 996 997
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
998 999
		BUG_ON(!left);
		wret = balance_node_right(trans, root, mid, left);
1000
		if (wret < 0) {
1001
			ret = wret;
1002 1003
			goto enospc;
		}
1004 1005 1006 1007 1008
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
1009 1010
		BUG_ON(wret == 1);
	}
1011
	if (btrfs_header_nritems(mid) == 0) {
1012
		/* we've managed to empty the middle node, drop it */
1013
		u64 root_gen = btrfs_header_generation(parent);
1014 1015
		u64 bytenr = mid->start;
		u32 blocksize = mid->len;
1016

1017
		clean_tree_block(trans, root, mid);
1018
		btrfs_tree_unlock(mid);
1019
		free_extent_buffer(mid);
1020
		mid = NULL;
1021
		wret = del_ptr(trans, root, path, level + 1, pslot);
1022 1023
		if (wret)
			ret = wret;
1024
		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
Z
Zheng Yan 已提交
1025
					 parent->start,
1026
					 btrfs_header_owner(parent),
1027
					 root_gen, level, 1);
1028 1029
		if (wret)
			ret = wret;
1030 1031
	} else {
		/* update the parent key to reflect our changes */
1032 1033 1034 1035
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
1036
	}
1037

1038
	/* update the path */
1039 1040 1041
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
1042
			/* left was locked after cow */
1043
			path->nodes[level] = left;
1044 1045
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
1046 1047
			if (mid) {
				btrfs_tree_unlock(mid);
1048
				free_extent_buffer(mid);
1049
			}
1050
		} else {
1051
			orig_slot -= btrfs_header_nritems(left);
1052 1053 1054
			path->slots[level] = orig_slot;
		}
	}
1055
	/* double check we haven't messed things up */
C
Chris Mason 已提交
1056
	check_block(root, path, level);
C
Chris Mason 已提交
1057
	if (orig_ptr !=
1058
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1059
		BUG();
1060
enospc:
1061 1062
	if (right) {
		btrfs_tree_unlock(right);
1063
		free_extent_buffer(right);
1064 1065 1066 1067
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
1068
		free_extent_buffer(left);
1069
	}
1070 1071 1072
	return ret;
}

C
Chris Mason 已提交
1073 1074 1075 1076
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
C
Chris Mason 已提交
1077
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1078 1079
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
1080
{
1081 1082 1083 1084
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
1085 1086 1087 1088 1089 1090 1091 1092 1093
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 1;

1094
	mid = path->nodes[level];
1095
	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1096 1097 1098
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1)
1099
		parent = path->nodes[level + 1];
1100 1101
	pslot = path->slots[level + 1];

1102
	if (!parent)
1103 1104
		return 1;

1105
	left = read_node_slot(root, parent, pslot - 1);
1106 1107

	/* first, try to make some room in the middle buffer */
1108
	if (left) {
1109
		u32 left_nr;
1110 1111

		btrfs_tree_lock(left);
1112
		left_nr = btrfs_header_nritems(left);
C
Chris Mason 已提交
1113 1114 1115
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1116
			ret = btrfs_cow_block(trans, root, left, parent,
1117
					      pslot - 1, &left, 0);
1118 1119 1120 1121
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
1122
						      left, mid, 0);
1123
			}
C
Chris Mason 已提交
1124
		}
1125 1126 1127
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1128
			struct btrfs_disk_key disk_key;
1129
			orig_slot += left_nr;
1130 1131 1132 1133 1134
			btrfs_node_key(mid, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
1135 1136
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
1137
				btrfs_tree_unlock(mid);
1138
				free_extent_buffer(mid);
1139 1140
			} else {
				orig_slot -=
1141
					btrfs_header_nritems(left);
1142
				path->slots[level] = orig_slot;
1143
				btrfs_tree_unlock(left);
1144
				free_extent_buffer(left);
1145 1146 1147
			}
			return 0;
		}
1148
		btrfs_tree_unlock(left);
1149
		free_extent_buffer(left);
1150
	}
1151
	right = read_node_slot(root, parent, pslot + 1);
1152 1153 1154 1155

	/*
	 * then try to empty the right most buffer into the middle
	 */
1156
	if (right) {
C
Chris Mason 已提交
1157
		u32 right_nr;
1158
		btrfs_tree_lock(right);
1159
		right_nr = btrfs_header_nritems(right);
C
Chris Mason 已提交
1160 1161 1162
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1163 1164
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
1165
					      &right, 0);
1166 1167 1168 1169
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
1170
							  right, mid);
1171
			}
C
Chris Mason 已提交
1172
		}
1173 1174 1175
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1176 1177 1178 1179 1180 1181 1182 1183
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
1184 1185
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
1186
					btrfs_header_nritems(mid);
1187
				btrfs_tree_unlock(mid);
1188
				free_extent_buffer(mid);
1189
			} else {
1190
				btrfs_tree_unlock(right);
1191
				free_extent_buffer(right);
1192 1193 1194
			}
			return 0;
		}
1195
		btrfs_tree_unlock(right);
1196
		free_extent_buffer(right);
1197 1198 1199 1200
	}
	return 1;
}

1201
/*
C
Chris Mason 已提交
1202 1203
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
1204
 */
1205 1206 1207
static noinline void reada_for_search(struct btrfs_root *root,
				      struct btrfs_path *path,
				      int level, int slot, u64 objectid)
1208
{
1209
	struct extent_buffer *node;
1210
	struct btrfs_disk_key disk_key;
1211 1212
	u32 nritems;
	u64 search;
1213 1214 1215
	u64 lowest_read;
	u64 highest_read;
	u64 nread = 0;
1216
	int direction = path->reada;
1217
	struct extent_buffer *eb;
1218 1219 1220
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;
1221

1222
	if (level != 1)
1223 1224 1225
		return;

	if (!path->nodes[level])
1226 1227
		return;

1228
	node = path->nodes[level];
1229

1230
	search = btrfs_node_blockptr(node, slot);
1231 1232
	blocksize = btrfs_level_size(root, level - 1);
	eb = btrfs_find_tree_block(root, search, blocksize);
1233 1234
	if (eb) {
		free_extent_buffer(eb);
1235 1236 1237
		return;
	}

1238 1239 1240
	highest_read = search;
	lowest_read = search;

1241
	nritems = btrfs_header_nritems(node);
1242
	nr = slot;
C
Chris Mason 已提交
1243
	while (1) {
1244 1245 1246 1247 1248 1249 1250 1251
		if (direction < 0) {
			if (nr == 0)
				break;
			nr--;
		} else if (direction > 0) {
			nr++;
			if (nr >= nritems)
				break;
1252
		}
1253 1254 1255 1256 1257
		if (path->reada < 0 && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
1258 1259
		search = btrfs_node_blockptr(node, nr);
		if ((search >= lowest_read && search <= highest_read) ||
1260 1261
		    (search < lowest_read && lowest_read - search <= 16384) ||
		    (search > highest_read && search - highest_read <= 16384)) {
1262 1263
			readahead_tree_block(root, search, blocksize,
				     btrfs_node_ptr_generation(node, nr));
1264 1265 1266
			nread += blocksize;
		}
		nscan++;
1267
		if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1268
			break;
C
Chris Mason 已提交
1269 1270

		if (nread > (256 * 1024) || nscan > 128)
1271 1272 1273 1274 1275 1276
			break;

		if (search < lowest_read)
			lowest_read = search;
		if (search > highest_read)
			highest_read = search;
1277 1278
	}
}
1279

C
Chris Mason 已提交
1280
/*
C
Chris Mason 已提交
1281 1282 1283 1284
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
C
Chris Mason 已提交
1285
 *
C
Chris Mason 已提交
1286 1287 1288
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
C
Chris Mason 已提交
1289
 *
C
Chris Mason 已提交
1290 1291
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
C
Chris Mason 已提交
1292
 */
1293 1294
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
1295 1296 1297
{
	int i;
	int skip_level = level;
1298
	int no_skips = 0;
1299 1300 1301 1302 1303 1304 1305
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
1306
		if (!no_skips && path->slots[i] == 0) {
1307 1308 1309
			skip_level = i + 1;
			continue;
		}
1310
		if (!no_skips && path->keep_locks) {
1311 1312 1313
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
1314
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
1315 1316 1317 1318
				skip_level = i + 1;
				continue;
			}
		}
1319 1320 1321
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

1322 1323 1324 1325 1326 1327 1328 1329
		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock(t);
			path->locks[i] = 0;
		}
	}
}

C
Chris Mason 已提交
1330 1331 1332 1333 1334 1335
/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
C
Chris Mason 已提交
1336 1337
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
C
Chris Mason 已提交
1338 1339 1340 1341
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
C
Chris Mason 已提交
1342
 */
1343 1344 1345
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
1346
{
1347
	struct extent_buffer *b;
1348
	struct extent_buffer *tmp;
1349 1350 1351
	int slot;
	int ret;
	int level;
1352
	int should_reada = p->reada;
1353
	int lowest_unlock = 1;
1354
	int blocksize;
1355
	u8 lowest_level = 0;
1356 1357
	u64 blocknr;
	u64 gen;
1358
	struct btrfs_key prealloc_block;
1359

1360
	lowest_level = p->lowest_level;
1361
	WARN_ON(lowest_level && ins_len > 0);
C
Chris Mason 已提交
1362
	WARN_ON(p->nodes[0] != NULL);
1363

1364 1365
	if (ins_len < 0)
		lowest_unlock = 2;
1366 1367 1368

	prealloc_block.objectid = 0;

1369
again:
1370 1371 1372 1373
	if (p->skip_locking)
		b = btrfs_root_node(root);
	else
		b = btrfs_lock_root_node(root);
1374

1375
	while (b) {
1376
		level = btrfs_header_level(b);
1377 1378 1379 1380 1381 1382 1383 1384 1385

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

C
Chris Mason 已提交
1386 1387
		if (cow) {
			int wret;
1388 1389 1390 1391

			/* is a cow on this block not required */
			spin_lock(&root->fs_info->hash_lock);
			if (btrfs_header_generation(b) == trans->transid &&
1392
			    btrfs_header_owner(b) == root->root_key.objectid &&
1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427
			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
				spin_unlock(&root->fs_info->hash_lock);
				goto cow_done;
			}
			spin_unlock(&root->fs_info->hash_lock);

			/* ok, we have to cow, is our old prealloc the right
			 * size?
			 */
			if (prealloc_block.objectid &&
			    prealloc_block.offset != b->len) {
				btrfs_free_reserved_extent(root,
					   prealloc_block.objectid,
					   prealloc_block.offset);
				prealloc_block.objectid = 0;
			}

			/*
			 * for higher level blocks, try not to allocate blocks
			 * with the block and the parent locks held.
			 */
			if (level > 1 && !prealloc_block.objectid &&
			    btrfs_path_lock_waiting(p, level)) {
				u32 size = b->len;
				u64 hint = b->start;

				btrfs_release_path(root, p);
				ret = btrfs_reserve_extent(trans, root,
							   size, size, 0,
							   hint, (u64)-1,
							   &prealloc_block, 0);
				BUG_ON(ret);
				goto again;
			}

C
Chris Mason 已提交
1428 1429 1430
			wret = btrfs_cow_block(trans, root, b,
					       p->nodes[level + 1],
					       p->slots[level + 1],
1431 1432
					       &b, prealloc_block.objectid);
			prealloc_block.objectid = 0;
1433
			if (wret) {
1434
				free_extent_buffer(b);
1435 1436
				ret = wret;
				goto done;
1437
			}
C
Chris Mason 已提交
1438
		}
1439
cow_done:
C
Chris Mason 已提交
1440
		BUG_ON(!cow && ins_len);
1441
		if (level != btrfs_header_level(b))
C
Chris Mason 已提交
1442
			WARN_ON(1);
1443
		level = btrfs_header_level(b);
1444

1445
		p->nodes[level] = b;
1446 1447
		if (!p->skip_locking)
			p->locks[level] = 1;
1448

C
Chris Mason 已提交
1449
		ret = check_block(root, p, level);
1450 1451 1452 1453
		if (ret) {
			ret = -1;
			goto done;
		}
1454

1455 1456
		ret = bin_search(b, key, level, &slot);
		if (level != 0) {
1457 1458 1459
			if (ret && slot > 0)
				slot -= 1;
			p->slots[level] = slot;
1460 1461
			if ((p->search_for_split || ins_len > 0) &&
			    btrfs_header_nritems(b) >=
1462
			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463
				int sret = split_node(trans, root, p, level);
C
Chris Mason 已提交
1464
				BUG_ON(sret > 0);
1465 1466 1467 1468
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1469 1470
				b = p->nodes[level];
				slot = p->slots[level];
1471
			} else if (ins_len < 0) {
1472 1473
				int sret = balance_level(trans, root, p,
							 level);
1474 1475 1476 1477
				if (sret) {
					ret = sret;
					goto done;
				}
1478
				b = p->nodes[level];
1479 1480
				if (!b) {
					btrfs_release_path(NULL, p);
1481
					goto again;
1482
				}
1483
				slot = p->slots[level];
1484
				BUG_ON(btrfs_header_nritems(b) == 1);
C
Chris Mason 已提交
1485
			}
1486 1487
			unlock_up(p, level, lowest_unlock);

1488
			/* this is only true while dropping a snapshot */
1489
			if (level == lowest_level) {
1490 1491
				ret = 0;
				goto done;
1492
			}
1493

1494 1495 1496 1497 1498 1499
			blocknr = btrfs_node_blockptr(b, slot);
			gen = btrfs_node_ptr_generation(b, slot);
			blocksize = btrfs_level_size(root, level - 1);

			tmp = btrfs_find_tree_block(root, blocknr, blocksize);
			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1500 1501 1502 1503 1504 1505 1506 1507 1508
				b = tmp;
			} else {
				/*
				 * reduce lock contention at high levels
				 * of the btree by dropping locks before
				 * we read.
				 */
				if (level > 1) {
					btrfs_release_path(NULL, p);
1509 1510
					if (tmp)
						free_extent_buffer(tmp);
1511 1512 1513 1514 1515
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);

1516 1517
					tmp = read_tree_block(root, blocknr,
							 blocksize, gen);
1518 1519 1520 1521
					if (tmp)
						free_extent_buffer(tmp);
					goto again;
				} else {
1522 1523
					if (tmp)
						free_extent_buffer(tmp);
1524 1525 1526 1527
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);
1528 1529 1530
					b = read_node_slot(root, b, slot);
				}
			}
1531 1532
			if (!p->skip_locking)
				btrfs_tree_lock(b);
1533 1534
		} else {
			p->slots[level] = slot;
1535 1536
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
1537
				int sret = split_leaf(trans, root, key,
1538
						      p, ins_len, ret == 0);
C
Chris Mason 已提交
1539
				BUG_ON(sret > 0);
1540 1541 1542 1543
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1544
			}
1545 1546
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock);
1547
			goto done;
1548 1549
		}
	}
1550 1551 1552 1553 1554 1555 1556 1557 1558
	ret = 1;
done:
	if (prealloc_block.objectid) {
		btrfs_free_reserved_extent(root,
			   prealloc_block.objectid,
			   prealloc_block.offset);
	}

	return ret;
1559 1560
}

Z
Zheng Yan 已提交
1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600
int btrfs_merge_path(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_key *node_keys,
		     u64 *nodes, int lowest_level)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_key key;
	u64 bytenr;
	u64 generation;
	u32 blocksize;
	int level;
	int slot;
	int key_match;
	int ret;

	eb = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
	BUG_ON(ret);

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		if (level == 0 || level <= lowest_level)
			break;

		ret = bin_search(parent, &node_keys[lowest_level], level,
				 &slot);
		if (ret && slot > 0)
			slot--;

		bytenr = btrfs_node_blockptr(parent, slot);
		if (nodes[level - 1] == bytenr)
			break;

		blocksize = btrfs_level_size(root, level - 1);
		generation = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(eb, &key, slot);
		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));

Y
Yan Zheng 已提交
1601 1602 1603 1604 1605 1606
		if (generation == trans->transid) {
			eb = read_tree_block(root, bytenr, blocksize,
					     generation);
			btrfs_tree_lock(eb);
		}

Z
Zheng Yan 已提交
1607 1608 1609
		/*
		 * if node keys match and node pointer hasn't been modified
		 * in the running transaction, we can merge the path. for
Y
Yan Zheng 已提交
1610 1611 1612
		 * blocks owened by reloc trees, the node pointer check is
		 * skipped, this is because these blocks are fully controlled
		 * by the space balance code, no one else can modify them.
Z
Zheng Yan 已提交
1613 1614 1615
		 */
		if (!nodes[level - 1] || !key_match ||
		    (generation == trans->transid &&
Y
Yan Zheng 已提交
1616 1617 1618 1619 1620 1621
		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
			if (level == 1 || level == lowest_level + 1) {
				if (generation == trans->transid) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
				}
Z
Zheng Yan 已提交
1622
				break;
Y
Yan Zheng 已提交
1623
			}
Z
Zheng Yan 已提交
1624

Y
Yan Zheng 已提交
1625 1626 1627 1628 1629
			if (generation != trans->transid) {
				eb = read_tree_block(root, bytenr, blocksize,
						generation);
				btrfs_tree_lock(eb);
			}
Z
Zheng Yan 已提交
1630 1631 1632 1633 1634

			ret = btrfs_cow_block(trans, root, eb, parent, slot,
					      &eb, 0);
			BUG_ON(ret);

Y
Yan Zheng 已提交
1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				if (!nodes[level - 1]) {
					nodes[level - 1] = eb->start;
					memcpy(&node_keys[level - 1], &key,
					       sizeof(node_keys[0]));
				} else {
					WARN_ON(1);
				}
			}

Z
Zheng Yan 已提交
1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			parent = eb;
			continue;
		}

		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
		btrfs_mark_buffer_dirty(parent);

		ret = btrfs_inc_extent_ref(trans, root,
					nodes[level - 1],
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
1661
					level - 1);
Z
Zheng Yan 已提交
1662 1663
		BUG_ON(ret);

Y
Yan Zheng 已提交
1664 1665 1666 1667 1668
		/*
		 * If the block was created in the running transaction,
		 * it's possible this is the last reference to it, so we
		 * should drop the subtree.
		 */
Z
Zheng Yan 已提交
1669
		if (generation == trans->transid) {
Y
Yan Zheng 已提交
1670 1671
			ret = btrfs_drop_subtree(trans, root, eb, parent);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1672 1673
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
Y
Yan Zheng 已提交
1674 1675 1676 1677 1678 1679 1680
		} else {
			ret = btrfs_free_extent(trans, root, bytenr,
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
					level - 1, 1);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1681 1682 1683 1684 1685 1686 1687 1688
		}
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return 0;
}

C
Chris Mason 已提交
1689 1690 1691 1692 1693 1694
/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node is points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
C
Chris Mason 已提交
1695 1696 1697
 *
 * If this fails to write a tree block, it returns -1, but continues
 * fixing up the blocks in ram so the tree is consistent.
C
Chris Mason 已提交
1698
 */
1699 1700 1701
static int fixup_low_keys(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_path *path,
			  struct btrfs_disk_key *key, int level)
1702 1703
{
	int i;
C
Chris Mason 已提交
1704
	int ret = 0;
1705 1706
	struct extent_buffer *t;

C
Chris Mason 已提交
1707
	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1708
		int tslot = path->slots[i];
1709
		if (!path->nodes[i])
1710
			break;
1711 1712
		t = path->nodes[i];
		btrfs_set_node_key(t, key, tslot);
C
Chris Mason 已提交
1713
		btrfs_mark_buffer_dirty(path->nodes[i]);
1714 1715 1716
		if (tslot != 0)
			break;
	}
C
Chris Mason 已提交
1717
	return ret;
1718 1719
}

Z
Zheng Yan 已提交
1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (comp_keys(&disk_key, new_key) >= 0)
			return -1;
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (comp_keys(&disk_key, new_key) <= 0)
			return -1;
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
		fixup_low_keys(trans, root, path, &disk_key, 1);
	return 0;
}

C
Chris Mason 已提交
1755 1756
/*
 * try to push data from one node into the next node left in the
1757
 * tree.
C
Chris Mason 已提交
1758 1759 1760
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
C
Chris Mason 已提交
1761
 */
1762 1763
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
1764
			  struct extent_buffer *src, int empty)
1765 1766
{
	int push_items = 0;
1767 1768
	int src_nritems;
	int dst_nritems;
C
Chris Mason 已提交
1769
	int ret = 0;
1770

1771 1772
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1773
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1774 1775
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);
1776

1777
	if (!empty && src_nritems <= 8)
1778 1779
		return 1;

C
Chris Mason 已提交
1780
	if (push_items <= 0)
1781 1782
		return 1;

1783
	if (empty) {
1784
		push_items = min(src_nritems, push_items);
1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);
1797

1798 1799 1800
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
C
Chris Mason 已提交
1801
			   push_items * sizeof(struct btrfs_key_ptr));
1802

1803
	if (push_items < src_nritems) {
1804 1805 1806 1807 1808 1809 1810 1811 1812
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
1813 1814 1815 1816

	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
	BUG_ON(ret);

1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828
	return ret;
}

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
1829 1830 1831 1832
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
1833 1834 1835 1836 1837 1838 1839
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

1840 1841 1842
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

1843 1844
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1845
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
C
Chris Mason 已提交
1846
	if (push_items <= 0)
1847
		return 1;
1848

C
Chris Mason 已提交
1849
	if (src_nritems < 4)
1850
		return 1;
1851 1852 1853

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
C
Chris Mason 已提交
1854
	if (max_push >= src_nritems)
1855
		return 1;
Y
Yan 已提交
1856

1857 1858 1859
	if (max_push < push_items)
		push_items = max_push;

1860 1861 1862 1863
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
1864

1865 1866 1867
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
C
Chris Mason 已提交
1868
			   push_items * sizeof(struct btrfs_key_ptr));
1869

1870 1871
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
1872

1873 1874
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
1875 1876 1877 1878

	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
	BUG_ON(ret);

C
Chris Mason 已提交
1879
	return ret;
1880 1881
}

C
Chris Mason 已提交
1882 1883 1884 1885
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
C
Chris Mason 已提交
1886 1887
 *
 * returns zero on success or < 0 on failure.
C
Chris Mason 已提交
1888
 */
C
Chris Mason 已提交
1889
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
1890 1891
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
C
Chris Mason 已提交
1892
{
1893
	u64 lower_gen;
1894 1895
	struct extent_buffer *lower;
	struct extent_buffer *c;
1896
	struct extent_buffer *old;
1897
	struct btrfs_disk_key lower_key;
Z
Zheng Yan 已提交
1898
	int ret;
C
Chris Mason 已提交
1899 1900 1901 1902

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

1903 1904 1905 1906 1907 1908
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

Z
Zheng Yan 已提交
1909 1910
	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, trans->transid,
1911
				   level, root->node->start, 0);
1912 1913
	if (IS_ERR(c))
		return PTR_ERR(c);
1914

1915 1916 1917
	memset_extent_buffer(c, 0, 0, root->nodesize);
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
1918
	btrfs_set_header_bytenr(c, c->start);
1919 1920 1921 1922 1923 1924
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);
1925 1926 1927 1928 1929

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

1930
	btrfs_set_node_key(c, &lower_key, 0);
1931
	btrfs_set_node_blockptr(c, 0, lower->start);
1932
	lower_gen = btrfs_header_generation(lower);
Z
Zheng Yan 已提交
1933
	WARN_ON(lower_gen != trans->transid);
1934 1935

	btrfs_set_node_ptr_generation(c, 0, lower_gen);
1936

1937
	btrfs_mark_buffer_dirty(c);
1938

1939 1940
	spin_lock(&root->node_lock);
	old = root->node;
1941
	root->node = c;
1942 1943
	spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
1944 1945 1946
	ret = btrfs_update_extent_ref(trans, root, lower->start,
				      lower->start, c->start,
				      root->root_key.objectid,
1947
				      trans->transid, level - 1);
Z
Zheng Yan 已提交
1948 1949
	BUG_ON(ret);

1950 1951 1952
	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

1953
	add_root_to_dirty_list(root);
1954 1955
	extent_buffer_get(c);
	path->nodes[level] = c;
1956
	path->locks[level] = 1;
C
Chris Mason 已提交
1957 1958 1959 1960
	path->slots[level] = 0;
	return 0;
}

C
Chris Mason 已提交
1961 1962 1963
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
C
Chris Mason 已提交
1964
 *
C
Chris Mason 已提交
1965 1966
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
C
Chris Mason 已提交
1967 1968
 *
 * returns zero on success and < 0 on any error
C
Chris Mason 已提交
1969
 */
1970 1971
static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, struct btrfs_disk_key
1972
		      *key, u64 bytenr, int slot, int level)
C
Chris Mason 已提交
1973
{
1974
	struct extent_buffer *lower;
C
Chris Mason 已提交
1975
	int nritems;
C
Chris Mason 已提交
1976 1977

	BUG_ON(!path->nodes[level]);
1978 1979
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
C
Chris Mason 已提交
1980 1981
	if (slot > nritems)
		BUG();
C
Chris Mason 已提交
1982
	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
C
Chris Mason 已提交
1983 1984
		BUG();
	if (slot != nritems) {
1985 1986 1987
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
C
Chris Mason 已提交
1988
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
1989
	}
1990
	btrfs_set_node_key(lower, key, slot);
1991
	btrfs_set_node_blockptr(lower, slot, bytenr);
1992 1993
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
1994 1995
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
C
Chris Mason 已提交
1996 1997 1998
	return 0;
}

C
Chris Mason 已提交
1999 2000 2001 2002 2003 2004
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
C
Chris Mason 已提交
2005 2006
 *
 * returns 0 on success and < 0 on failure
C
Chris Mason 已提交
2007
 */
2008 2009 2010
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
2011
{
2012 2013 2014
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
2015
	int mid;
C
Chris Mason 已提交
2016
	int ret;
C
Chris Mason 已提交
2017
	int wret;
2018
	u32 c_nritems;
2019

2020
	c = path->nodes[level];
2021
	WARN_ON(btrfs_header_generation(c) != trans->transid);
2022
	if (c == root->node) {
C
Chris Mason 已提交
2023
		/* trying to split the root, lets make a new one */
2024
		ret = insert_new_root(trans, root, path, level + 1);
C
Chris Mason 已提交
2025 2026
		if (ret)
			return ret;
2027 2028
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
2029 2030
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
2031
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2032
			return 0;
2033 2034
		if (ret < 0)
			return ret;
2035
	}
2036

2037
	c_nritems = btrfs_header_nritems(c);
2038

2039
	split = btrfs_alloc_free_block(trans, root, root->nodesize,
Z
Zheng Yan 已提交
2040 2041 2042
					path->nodes[level + 1]->start,
					root->root_key.objectid,
					trans->transid, level, c->start, 0);
2043 2044 2045 2046 2047
	if (IS_ERR(split))
		return PTR_ERR(split);

	btrfs_set_header_flags(split, btrfs_header_flags(c));
	btrfs_set_header_level(split, btrfs_header_level(c));
2048
	btrfs_set_header_bytenr(split, split->start);
2049 2050
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_owner(split, root->root_key.objectid);
2051
	btrfs_set_header_flags(split, 0);
2052 2053 2054
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
2055 2056 2057
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);
2058

2059
	mid = (c_nritems + 1) / 2;
2060 2061 2062 2063 2064 2065 2066

	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
C
Chris Mason 已提交
2067 2068
	ret = 0;

2069 2070 2071 2072
	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	btrfs_node_key(split, &disk_key, 0);
2073
	wret = insert_ptr(trans, root, path, &disk_key, split->start,
2074
			  path->slots[level + 1] + 1,
C
Chris Mason 已提交
2075
			  level + 1);
C
Chris Mason 已提交
2076 2077 2078
	if (wret)
		ret = wret;

Z
Zheng Yan 已提交
2079 2080 2081
	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
	BUG_ON(ret);

C
Chris Mason 已提交
2082
	if (path->slots[level] >= mid) {
C
Chris Mason 已提交
2083
		path->slots[level] -= mid;
2084
		btrfs_tree_unlock(c);
2085 2086
		free_extent_buffer(c);
		path->nodes[level] = split;
C
Chris Mason 已提交
2087 2088
		path->slots[level + 1] += 1;
	} else {
2089
		btrfs_tree_unlock(split);
2090
		free_extent_buffer(split);
2091
	}
C
Chris Mason 已提交
2092
	return ret;
2093 2094
}

C
Chris Mason 已提交
2095 2096 2097 2098 2099
/*
 * how many bytes are required to store the items in a leaf.  start
 * and nr indicate which items in the leaf to check.  This totals up the
 * space used both by the item structs and the item data
 */
2100
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2101 2102
{
	int data_len;
2103
	int nritems = btrfs_header_nritems(l);
2104
	int end = min(nritems, start + nr) - 1;
2105 2106 2107

	if (!nr)
		return 0;
2108 2109
	data_len = btrfs_item_end_nr(l, start);
	data_len = data_len - btrfs_item_offset_nr(l, end);
C
Chris Mason 已提交
2110
	data_len += sizeof(struct btrfs_item) * nr;
2111
	WARN_ON(data_len < 0);
2112 2113 2114
	return data_len;
}

2115 2116 2117 2118 2119
/*
 * The space between the end of the leaf items and
 * the start of the leaf data.  IOW, how much room
 * the leaf has left for both items and data
 */
C
Chris Mason 已提交
2120
noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2121
				   struct extent_buffer *leaf)
2122
{
2123 2124 2125 2126
	int nritems = btrfs_header_nritems(leaf);
	int ret;
	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
C
Chris Mason 已提交
2127 2128
		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
		       "used %d nritems %d\n",
J
Jens Axboe 已提交
2129
		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2130 2131 2132
		       leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
2133 2134
}

C
Chris Mason 已提交
2135 2136 2137
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
C
Chris Mason 已提交
2138 2139 2140
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
C
Chris Mason 已提交
2141
 */
2142
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2143 2144
			   *root, struct btrfs_path *path, int data_size,
			   int empty)
C
Chris Mason 已提交
2145
{
2146 2147 2148 2149
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2150
	int slot;
2151
	u32 i;
C
Chris Mason 已提交
2152 2153 2154
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2155
	struct btrfs_item *item;
2156
	u32 left_nritems;
2157
	u32 nr;
2158
	u32 right_nritems;
2159
	u32 data_end;
2160
	u32 this_item_size;
2161
	int ret;
C
Chris Mason 已提交
2162 2163

	slot = path->slots[1];
C
Chris Mason 已提交
2164
	if (!path->nodes[1])
C
Chris Mason 已提交
2165
		return 1;
C
Chris Mason 已提交
2166

C
Chris Mason 已提交
2167
	upper = path->nodes[1];
2168
	if (slot >= btrfs_header_nritems(upper) - 1)
C
Chris Mason 已提交
2169
		return 1;
2170

2171 2172
	WARN_ON(!btrfs_tree_locked(path->nodes[1]));

2173
	right = read_node_slot(root, upper, slot + 1);
2174
	btrfs_tree_lock(right);
C
Chris Mason 已提交
2175
	free_space = btrfs_leaf_free_space(root, right);
2176
	if (free_space < data_size)
2177
		goto out_unlock;
2178

C
Chris Mason 已提交
2179
	/* cow and double check */
2180
	ret = btrfs_cow_block(trans, root, right, upper,
2181
			      slot + 1, &right, 0);
2182 2183 2184
	if (ret)
		goto out_unlock;

C
Chris Mason 已提交
2185
	free_space = btrfs_leaf_free_space(root, right);
2186
	if (free_space < data_size)
2187
		goto out_unlock;
C
Chris Mason 已提交
2188

2189
	left_nritems = btrfs_header_nritems(left);
2190 2191
	if (left_nritems == 0)
		goto out_unlock;
2192

2193 2194 2195 2196 2197
	if (empty)
		nr = 0;
	else
		nr = 1;

Z
Zheng Yan 已提交
2198
	if (path->slots[0] >= left_nritems)
2199
		push_space += data_size;
Z
Zheng Yan 已提交
2200

2201 2202
	i = left_nritems - 1;
	while (i >= nr) {
2203
		item = btrfs_item_nr(left, i);
2204

Z
Zheng Yan 已提交
2205 2206 2207 2208 2209 2210 2211 2212 2213 2214
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

C
Chris Mason 已提交
2215
		if (path->slots[0] == i)
2216
			push_space += data_size;
2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227

		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
C
Chris Mason 已提交
2228
			break;
Z
Zheng Yan 已提交
2229

C
Chris Mason 已提交
2230
		push_items++;
2231
		push_space += this_item_size + sizeof(*item);
2232 2233 2234
		if (i == 0)
			break;
		i--;
2235 2236 2237 2238
	}
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
C
Chris Mason 已提交
2239
	}
2240

2241 2242
	if (push_items == 0)
		goto out_unlock;
2243

2244
	if (!empty && push_items == left_nritems)
2245
		WARN_ON(1);
2246

C
Chris Mason 已提交
2247
	/* push left to right */
2248
	right_nritems = btrfs_header_nritems(right);
2249

2250
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
C
Chris Mason 已提交
2251
	push_space -= leaf_data_end(root, left);
2252

C
Chris Mason 已提交
2253
	/* make room in the right data area */
2254 2255 2256 2257 2258 2259
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

C
Chris Mason 已提交
2260
	/* copy from the left data area */
2261
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
C
Chris Mason 已提交
2262 2263 2264
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);
2265 2266 2267 2268 2269

	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2270
	/* copy the items from left to right */
2271 2272 2273
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));
C
Chris Mason 已提交
2274 2275

	/* update the item pointers */
2276
	right_nritems += push_items;
2277
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2278
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2279
	for (i = 0; i < right_nritems; i++) {
2280
		item = btrfs_item_nr(right, i);
2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}
		push_space -= btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
C
Chris Mason 已提交
2295
	}
2296
	left_nritems -= push_items;
2297
	btrfs_set_header_nritems(left, left_nritems);
C
Chris Mason 已提交
2298

2299 2300
	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
2301
	btrfs_mark_buffer_dirty(right);
2302

Z
Zheng Yan 已提交
2303 2304 2305
	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
	BUG_ON(ret);

2306 2307
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
C
Chris Mason 已提交
2308
	btrfs_mark_buffer_dirty(upper);
C
Chris Mason 已提交
2309

C
Chris Mason 已提交
2310
	/* then fixup the leaf pointer in the path */
2311 2312
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
2313 2314 2315
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2316 2317
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
C
Chris Mason 已提交
2318 2319
		path->slots[1] += 1;
	} else {
2320
		btrfs_tree_unlock(right);
2321
		free_extent_buffer(right);
C
Chris Mason 已提交
2322 2323
	}
	return 0;
2324 2325 2326 2327 2328

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
C
Chris Mason 已提交
2329
}
2330

C
Chris Mason 已提交
2331 2332 2333 2334
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 */
2335
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2336 2337
			  *root, struct btrfs_path *path, int data_size,
			  int empty)
2338
{
2339 2340 2341
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
2342 2343 2344 2345 2346
	int slot;
	int i;
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2347
	struct btrfs_item *item;
2348
	u32 old_left_nritems;
2349
	u32 right_nritems;
2350
	u32 nr;
C
Chris Mason 已提交
2351 2352
	int ret = 0;
	int wret;
2353 2354
	u32 this_item_size;
	u32 old_left_item_size;
2355 2356

	slot = path->slots[1];
2357
	if (slot == 0)
2358
		return 1;
2359
	if (!path->nodes[1])
2360
		return 1;
2361

2362
	right_nritems = btrfs_header_nritems(right);
C
Chris Mason 已提交
2363
	if (right_nritems == 0)
2364 2365
		return 1;

2366 2367
	WARN_ON(!btrfs_tree_locked(path->nodes[1]));

2368
	left = read_node_slot(root, path->nodes[1], slot - 1);
2369
	btrfs_tree_lock(left);
C
Chris Mason 已提交
2370
	free_space = btrfs_leaf_free_space(root, left);
2371
	if (free_space < data_size) {
2372 2373
		ret = 1;
		goto out;
2374
	}
C
Chris Mason 已提交
2375 2376

	/* cow and double check */
2377
	ret = btrfs_cow_block(trans, root, left,
2378
			      path->nodes[1], slot - 1, &left, 0);
2379 2380
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
2381 2382
		ret = 1;
		goto out;
2383
	}
2384

C
Chris Mason 已提交
2385
	free_space = btrfs_leaf_free_space(root, left);
2386
	if (free_space < data_size) {
2387 2388
		ret = 1;
		goto out;
C
Chris Mason 已提交
2389 2390
	}

2391 2392 2393 2394 2395 2396
	if (empty)
		nr = right_nritems;
	else
		nr = right_nritems - 1;

	for (i = 0; i < nr; i++) {
2397
		item = btrfs_item_nr(right, i);
2398 2399 2400 2401 2402 2403 2404 2405
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

Z
Zheng Yan 已提交
2406 2407 2408 2409 2410 2411 2412 2413 2414 2415
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

2416
		if (path->slots[0] == i)
2417
			push_space += data_size;
2418 2419 2420

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
2421
			break;
2422

2423
		push_items++;
2424 2425 2426 2427 2428 2429
		push_space += this_item_size + sizeof(*item);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2430
	}
2431

2432
	if (push_items == 0) {
2433 2434
		ret = 1;
		goto out;
2435
	}
2436
	if (!empty && push_items == btrfs_header_nritems(right))
2437
		WARN_ON(1);
2438

2439
	/* push data from right to left */
2440 2441 2442 2443 2444
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2445
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
C
Chris Mason 已提交
2446
		     btrfs_item_offset_nr(right, push_items - 1);
2447 2448

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
C
Chris Mason 已提交
2449 2450
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
2451
		     btrfs_item_offset_nr(right, push_items - 1),
C
Chris Mason 已提交
2452
		     push_space);
2453
	old_left_nritems = btrfs_header_nritems(left);
2454
	BUG_ON(old_left_nritems <= 0);
2455

2456
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
C
Chris Mason 已提交
2457
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2458
		u32 ioff;
2459

2460
		item = btrfs_item_nr(left, i);
2461 2462 2463 2464 2465 2466 2467 2468
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

2469 2470
		ioff = btrfs_item_offset(left, item);
		btrfs_set_item_offset(left, item,
2471
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2472
	}
2473
	btrfs_set_header_nritems(left, old_left_nritems + push_items);
2474 2475 2476 2477
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}
2478 2479

	/* fixup right node */
2480
	if (push_items > right_nritems) {
C
Chris Mason 已提交
2481 2482
		printk(KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);
2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2495 2496 2497
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
2498
	}
2499 2500
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2501
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2502 2503
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		push_space = push_space - btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2519
	}
2520

2521
	btrfs_mark_buffer_dirty(left);
2522 2523
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
2524

Z
Zheng Yan 已提交
2525 2526 2527 2528
	ret = btrfs_update_ref(trans, root, right, left,
			       old_left_nritems, push_items);
	BUG_ON(ret);

2529 2530
	btrfs_item_key(right, &disk_key, 0);
	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
C
Chris Mason 已提交
2531 2532
	if (wret)
		ret = wret;
2533 2534 2535 2536

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
2537 2538 2539
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2540 2541
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
2542 2543
		path->slots[1] -= 1;
	} else {
2544
		btrfs_tree_unlock(left);
2545
		free_extent_buffer(left);
2546 2547
		path->slots[0] -= push_items;
	}
2548
	BUG_ON(path->slots[0] < 0);
C
Chris Mason 已提交
2549
	return ret;
2550 2551 2552 2553
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
2554 2555
}

C
Chris Mason 已提交
2556 2557 2558
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
C
Chris Mason 已提交
2559 2560
 *
 * returns 0 if all went well and < 0 on failure.
C
Chris Mason 已提交
2561
 */
2562 2563 2564 2565 2566
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
2567
{
2568
	struct extent_buffer *l;
2569
	u32 nritems;
2570 2571
	int mid;
	int slot;
2572
	struct extent_buffer *right;
2573 2574 2575
	int data_copy_size;
	int rt_data_off;
	int i;
2576
	int ret = 0;
C
Chris Mason 已提交
2577
	int wret;
2578 2579
	int double_split;
	int num_doubles = 0;
2580
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2581

C
Chris Mason 已提交
2582
	/* first try to make some room by pushing left and right */
2583
	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2584
		wret = push_leaf_right(trans, root, path, data_size, 0);
C
Chris Mason 已提交
2585
		if (wret < 0)
C
Chris Mason 已提交
2586
			return wret;
2587
		if (wret) {
2588
			wret = push_leaf_left(trans, root, path, data_size, 0);
2589 2590 2591 2592
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];
C
Chris Mason 已提交
2593

2594
		/* did the pushes work? */
2595
		if (btrfs_leaf_free_space(root, l) >= data_size)
2596
			return 0;
2597
	}
C
Chris Mason 已提交
2598

C
Chris Mason 已提交
2599
	if (!path->nodes[1]) {
2600
		ret = insert_new_root(trans, root, path, 1);
C
Chris Mason 已提交
2601 2602 2603
		if (ret)
			return ret;
	}
2604 2605 2606
again:
	double_split = 0;
	l = path->nodes[0];
2607
	slot = path->slots[0];
2608
	nritems = btrfs_header_nritems(l);
C
Chris Mason 已提交
2609
	mid = (nritems + 1) / 2;
2610

2611
	right = btrfs_alloc_free_block(trans, root, root->leafsize,
Z
Zheng Yan 已提交
2612 2613 2614
					path->nodes[1]->start,
					root->root_key.objectid,
					trans->transid, 0, l->start, 0);
2615 2616
	if (IS_ERR(right)) {
		BUG_ON(1);
2617
		return PTR_ERR(right);
2618
	}
2619 2620

	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2621
	btrfs_set_header_bytenr(right, right->start);
2622 2623 2624 2625 2626 2627
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);
2628 2629 2630 2631

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);
2632 2633
	if (mid <= slot) {
		if (nritems == 1 ||
2634
		    leaf_space_used(l, mid, nritems - mid) + data_size >
2635 2636 2637
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2638
				btrfs_set_header_nritems(right, 0);
2639
				wret = insert_ptr(trans, root, path,
2640
						  &disk_key, right->start,
2641 2642 2643
						  path->slots[1] + 1, 1);
				if (wret)
					ret = wret;
2644 2645

				btrfs_tree_unlock(path->nodes[0]);
2646 2647
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2648 2649
				path->slots[0] = 0;
				path->slots[1] += 1;
2650
				btrfs_mark_buffer_dirty(right);
2651 2652 2653
				return ret;
			}
			mid = slot;
2654 2655
			if (mid != nritems &&
			    leaf_space_used(l, mid, nritems - mid) +
2656
			    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2657 2658
				double_split = 1;
			}
2659 2660
		}
	} else {
2661
		if (leaf_space_used(l, 0, mid) + data_size >
2662
			BTRFS_LEAF_DATA_SIZE(root)) {
2663
			if (!extend && data_size && slot == 0) {
2664
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2665
				btrfs_set_header_nritems(right, 0);
2666 2667
				wret = insert_ptr(trans, root, path,
						  &disk_key,
2668
						  right->start,
2669
						  path->slots[1], 1);
2670 2671
				if (wret)
					ret = wret;
2672
				btrfs_tree_unlock(path->nodes[0]);
2673 2674
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2675
				path->slots[0] = 0;
2676 2677
				if (path->slots[1] == 0) {
					wret = fixup_low_keys(trans, root,
C
Chris Mason 已提交
2678
						      path, &disk_key, 1);
2679 2680 2681
					if (wret)
						ret = wret;
				}
2682
				btrfs_mark_buffer_dirty(right);
2683
				return ret;
2684
			} else if ((extend || !data_size) && slot == 0) {
2685 2686 2687 2688 2689
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
2690
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2691 2692
					double_split = 1;
				}
2693
			}
2694 2695
		}
	}
2696 2697 2698 2699 2700 2701 2702 2703 2704
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	copy_extent_buffer(right, l,
C
Chris Mason 已提交
2705 2706 2707
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);
2708

C
Chris Mason 已提交
2709
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2710
		      btrfs_item_end_nr(l, mid);
C
Chris Mason 已提交
2711

2712 2713
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724
		u32 ioff;

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(right, item);
2725
		btrfs_set_item_offset(right, item, ioff + rt_data_off);
C
Chris Mason 已提交
2726
	}
C
Chris Mason 已提交
2727

2728 2729 2730 2731 2732
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

2733
	btrfs_set_header_nritems(l, mid);
C
Chris Mason 已提交
2734
	ret = 0;
2735
	btrfs_item_key(right, &disk_key, 0);
2736 2737
	wret = insert_ptr(trans, root, path, &disk_key, right->start,
			  path->slots[1] + 1, 1);
C
Chris Mason 已提交
2738 2739
	if (wret)
		ret = wret;
2740 2741 2742

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
2743
	BUG_ON(path->slots[0] != slot);
2744

Z
Zheng Yan 已提交
2745 2746 2747
	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
	BUG_ON(ret);

2748
	if (mid <= slot) {
2749
		btrfs_tree_unlock(path->nodes[0]);
2750 2751
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
2752 2753
		path->slots[0] -= mid;
		path->slots[1] += 1;
2754 2755
	} else {
		btrfs_tree_unlock(right);
2756
		free_extent_buffer(right);
2757
	}
2758

2759
	BUG_ON(path->slots[0] < 0);
2760

2761 2762 2763 2764
	if (double_split) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
2765
	}
2766 2767 2768
	return ret;
}

2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822
/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation.  After
 * the split, the path is pointing to the old item.  The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be smaller enough to live alone on
 * a tree block with room for one extra struct btrfs_item
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	u32 item_size;
	struct extent_buffer *leaf;
	struct btrfs_key orig_key;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int ret = 0;
	int slot;
	u32 nritems;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;
	char *buf;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
	if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
		goto split;

	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	btrfs_release_path(root, path);

	path->search_for_split = 1;
	path->keep_locks = 1;

	ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
	path->search_for_split = 0;

	/* if our item isn't there or got smaller, return now */
	if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
							path->slots[0])) {
		path->keep_locks = 0;
		return -EAGAIN;
	}

2823 2824
	ret = split_leaf(trans, root, &orig_key, path,
			 sizeof(struct btrfs_item), 1);
2825 2826 2827 2828
	path->keep_locks = 0;
	BUG_ON(ret);

	leaf = path->nodes[0];
2829
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886

split:
	item = btrfs_item_nr(leaf, path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);


	buf = kmalloc(item_size, GFP_NOFS);
	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);
	slot = path->slots[0] + 1;
	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);

	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(leaf, slot);

	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
	kfree(buf);
	return ret;
}

C
Chris Mason 已提交
2887 2888 2889 2890 2891 2892
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
C
Chris Mason 已提交
2893 2894 2895
int btrfs_truncate_item(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct btrfs_path *path,
2896
			u32 new_size, int from_end)
C
Chris Mason 已提交
2897 2898 2899 2900
{
	int ret = 0;
	int slot;
	int slot_orig;
2901 2902
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
2903 2904 2905 2906 2907 2908 2909 2910
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;

	slot_orig = path->slots[0];
2911
	leaf = path->nodes[0];
2912 2913 2914 2915 2916
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return 0;
C
Chris Mason 已提交
2917

2918
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
2919 2920
	data_end = leaf_data_end(root, leaf);

2921
	old_data_start = btrfs_item_offset_nr(leaf, slot);
2922

C
Chris Mason 已提交
2923 2924 2925 2926 2927 2928 2929 2930 2931 2932
	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
2933 2934
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
2935 2936 2937 2938 2939 2940 2941 2942 2943

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}

2944 2945
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff + size_diff);
C
Chris Mason 已提交
2946
	}
2947 2948 2949 2950 2951 2952

	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

C
Chris Mason 已提交
2953
	/* shift the data */
2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976
	if (from_end) {
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
C
Chris Mason 已提交
2977 2978
				      (unsigned long)fi,
				      offsetof(struct btrfs_file_extent_item,
2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992
						 disk_bytenr));
			}
		}

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, root, path, &disk_key, 1);
	}
2993 2994 2995 2996

	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
2997 2998

	ret = 0;
2999 3000
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3001
		BUG();
3002
	}
C
Chris Mason 已提交
3003 3004 3005
	return ret;
}

C
Chris Mason 已提交
3006 3007 3008
/*
 * make the item pointed to by the path bigger, data_size is the new size.
 */
3009 3010 3011
int btrfs_extend_item(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, struct btrfs_path *path,
		      u32 data_size)
3012 3013 3014 3015
{
	int ret = 0;
	int slot;
	int slot_orig;
3016 3017
	struct extent_buffer *leaf;
	struct btrfs_item *item;
3018 3019 3020 3021 3022 3023 3024
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;

	slot_orig = path->slots[0];
3025
	leaf = path->nodes[0];
3026

3027
	nritems = btrfs_header_nritems(leaf);
3028 3029
	data_end = leaf_data_end(root, leaf);

3030 3031
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
3032
		BUG();
3033
	}
3034
	slot = path->slots[0];
3035
	old_data = btrfs_item_end_nr(leaf, slot);
3036 3037

	BUG_ON(slot < 0);
3038 3039
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3040 3041
		printk(KERN_CRIT "slot %d too large, nritems %d\n",
		       slot, nritems);
3042 3043
		BUG_ON(1);
	}
3044 3045 3046 3047 3048 3049

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
3050 3051
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
3052 3053 3054 3055 3056 3057 3058 3059

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}
3060 3061
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff - data_size);
3062
	}
3063

3064 3065 3066 3067 3068
	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

3069
	/* shift the data */
3070
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3071 3072
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);
3073

3074
	data_end = old_data;
3075 3076 3077 3078
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);
3079 3080

	ret = 0;
3081 3082
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3083
		BUG();
3084
	}
3085 3086 3087
	return ret;
}

3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110
/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 * Returns the number of keys that were inserted.
 */
int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	int ret = 0;
	int slot;
	int i;
	u32 nritems;
	u32 total_data = 0;
	u32 total_size = 0;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct btrfs_key found_key;

3111 3112 3113 3114 3115 3116
	for (i = 0; i < nr; i++) {
		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
		    BTRFS_LEAF_DATA_SIZE(root)) {
			break;
			nr = i;
		}
3117
		total_data += data_size[i];
3118 3119 3120
		total_size += data_size[i] + sizeof(struct btrfs_item);
	}
	BUG_ON(nr == 0);
3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162

	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	if (btrfs_leaf_free_space(root, leaf) < total_size) {
		for (i = nr; i >= 0; i--) {
			total_data -= data_size[i];
			total_size -= data_size[i] + sizeof(struct btrfs_item);
			if (total_size < btrfs_leaf_free_space(root, leaf))
				break;
		}
		nr = i;
	}

	slot = path->slots[0];
	BUG_ON(slot < 0);

	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		item = btrfs_item_nr(leaf, slot);
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* figure out how many keys we can insert in here */
		total_data = data_size[0];
		for (i = 1; i < nr; i++) {
			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
				break;
			total_data += data_size[i];
		}
		nr = i;

		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3163
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239
			       slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		WARN_ON(leaf->map_token);
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff - total_data);
		}
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end - total_data, btrfs_leaf_data(leaf) +
			      data_end, old_data - data_end);
		data_end = old_data;
	} else {
		/*
		 * this sucks but it has to be done, if we are inserting at
		 * the end of the leaf only insert 1 of the items, since we
		 * have no way of knowing whats on the next leaf and we'd have
		 * to drop our current locks to figure it out
		 */
		nr = 1;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
	}

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
out:
	if (!ret)
		ret = nr;
	return ret;
}

C
Chris Mason 已提交
3240
/*
C
Chris Mason 已提交
3241
 * Given a key and some data, insert items into the tree.
C
Chris Mason 已提交
3242 3243
 * This does all the path init required, making room in the tree if needed.
 */
3244
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3245 3246
			    struct btrfs_root *root,
			    struct btrfs_path *path,
3247 3248
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
3249
{
3250 3251
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
3252
	int ret = 0;
3253
	int slot;
3254
	int slot_orig;
3255
	int i;
3256
	u32 nritems;
3257 3258
	u32 total_size = 0;
	u32 total_data = 0;
3259
	unsigned int data_end;
C
Chris Mason 已提交
3260 3261
	struct btrfs_disk_key disk_key;

C
Chris Mason 已提交
3262
	for (i = 0; i < nr; i++)
3263
		total_data += data_size[i];
3264

3265
	total_size = total_data + (nr * sizeof(struct btrfs_item));
3266
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
J
Josef Bacik 已提交
3267
	if (ret == 0)
3268
		return -EEXIST;
3269 3270
	if (ret < 0)
		goto out;
3271

3272
	slot_orig = path->slots[0];
3273
	leaf = path->nodes[0];
C
Chris Mason 已提交
3274

3275
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
3276
	data_end = leaf_data_end(root, leaf);
3277

3278
	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3279
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3280
		printk(KERN_CRIT "not enough freespace need %u have %d\n",
3281
		       total_size, btrfs_leaf_free_space(root, leaf));
3282
		BUG();
3283
	}
3284

3285
	slot = path->slots[0];
3286
	BUG_ON(slot < 0);
3287

3288
	if (slot != nritems) {
3289
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3290

3291 3292
		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
3293
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3294 3295 3296
			       slot, old_data, data_end);
			BUG_ON(1);
		}
3297 3298 3299 3300
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
3301
		WARN_ON(leaf->map_token);
C
Chris Mason 已提交
3302
		for (i = slot; i < nritems; i++) {
3303
			u32 ioff;
3304

3305
			item = btrfs_item_nr(leaf, i);
3306 3307 3308 3309 3310 3311 3312 3313
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

3314
			ioff = btrfs_item_offset(leaf, item);
3315
			btrfs_set_item_offset(leaf, item, ioff - total_data);
C
Chris Mason 已提交
3316
		}
3317 3318 3319 3320
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}
3321 3322

		/* shift the items */
3323
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3324
			      btrfs_item_nr_offset(slot),
C
Chris Mason 已提交
3325
			      (nritems - slot) * sizeof(struct btrfs_item));
3326 3327

		/* shift the data */
3328
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3329
			      data_end - total_data, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3330
			      data_end, old_data - data_end);
3331 3332
		data_end = old_data;
	}
3333

3334
	/* setup the item for the new data */
3335 3336 3337 3338 3339 3340 3341 3342 3343
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
3344
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
3345 3346

	ret = 0;
3347 3348
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3349
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3350
	}
C
Chris Mason 已提交
3351

3352 3353
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3354
		BUG();
3355
	}
3356
out:
3357 3358 3359 3360 3361 3362 3363
	return ret;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
3364 3365 3366
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *cpu_key, void *data, u32
		      data_size)
3367 3368
{
	int ret = 0;
C
Chris Mason 已提交
3369
	struct btrfs_path *path;
3370 3371
	struct extent_buffer *leaf;
	unsigned long ptr;
3372

C
Chris Mason 已提交
3373 3374 3375
	path = btrfs_alloc_path();
	BUG_ON(!path);
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3376
	if (!ret) {
3377 3378 3379 3380
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
3381
	}
C
Chris Mason 已提交
3382
	btrfs_free_path(path);
C
Chris Mason 已提交
3383
	return ret;
3384 3385
}

C
Chris Mason 已提交
3386
/*
C
Chris Mason 已提交
3387
 * delete the pointer from a given node.
C
Chris Mason 已提交
3388
 *
C
Chris Mason 已提交
3389 3390
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
C
Chris Mason 已提交
3391
 */
3392 3393
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot)
3394
{
3395
	struct extent_buffer *parent = path->nodes[level];
3396
	u32 nritems;
C
Chris Mason 已提交
3397
	int ret = 0;
3398
	int wret;
3399

3400
	nritems = btrfs_header_nritems(parent);
C
Chris Mason 已提交
3401
	if (slot != nritems - 1) {
3402 3403 3404
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
C
Chris Mason 已提交
3405 3406
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
3407
	}
3408
	nritems--;
3409
	btrfs_set_header_nritems(parent, nritems);
3410
	if (nritems == 0 && parent == root->node) {
3411
		BUG_ON(btrfs_header_level(root->node) != 1);
3412
		/* just turn the root into a leaf and break */
3413
		btrfs_set_header_level(root->node, 0);
3414
	} else if (slot == 0) {
3415 3416 3417 3418
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
C
Chris Mason 已提交
3419 3420
		if (wret)
			ret = wret;
3421
	}
C
Chris Mason 已提交
3422
	btrfs_mark_buffer_dirty(parent);
C
Chris Mason 已提交
3423
	return ret;
3424 3425
}

3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].  bytenr is the node block pointer, but since the callers
 * already know it, it is faster to have them pass it down than to
 * read it out of the node again.
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path, u64 bytenr)
{
	int ret;
	u64 root_gen = btrfs_header_generation(path->nodes[1]);

	ret = del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret)
		return ret;

	ret = btrfs_free_extent(trans, root, bytenr,
				btrfs_level_size(root, 0),
				path->nodes[1]->start,
				btrfs_header_owner(path->nodes[1]),
3453
				root_gen, 0, 1);
3454 3455
	return ret;
}
C
Chris Mason 已提交
3456 3457 3458 3459
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
3460 3461
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
3462
{
3463 3464
	struct extent_buffer *leaf;
	struct btrfs_item *item;
3465 3466
	int last_off;
	int dsize = 0;
C
Chris Mason 已提交
3467 3468
	int ret = 0;
	int wret;
3469
	int i;
3470
	u32 nritems;
3471

3472
	leaf = path->nodes[0];
3473 3474 3475 3476 3477
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

3478
	nritems = btrfs_header_nritems(leaf);
3479

3480
	if (slot + nr != nritems) {
C
Chris Mason 已提交
3481
		int data_end = leaf_data_end(root, leaf);
3482 3483

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3484 3485
			      data_end + dsize,
			      btrfs_leaf_data(leaf) + data_end,
3486
			      last_off - data_end);
3487

3488
		for (i = slot + nr; i < nritems; i++) {
3489
			u32 ioff;
3490

3491
			item = btrfs_item_nr(leaf, i);
3492 3493 3494 3495 3496 3497 3498
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}
3499 3500
			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff + dsize);
C
Chris Mason 已提交
3501
		}
3502 3503 3504 3505 3506 3507

		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

3508
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3509
			      btrfs_item_nr_offset(slot + nr),
C
Chris Mason 已提交
3510
			      sizeof(struct btrfs_item) *
3511
			      (nritems - slot - nr));
3512
	}
3513 3514
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;
3515

C
Chris Mason 已提交
3516
	/* delete the leaf if we've emptied it */
3517
	if (nritems == 0) {
3518 3519
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
3520
		} else {
3521 3522
			ret = btrfs_del_leaf(trans, root, path, leaf->start);
			BUG_ON(ret);
3523
		}
3524
	} else {
3525
		int used = leaf_space_used(leaf, 0, nritems);
C
Chris Mason 已提交
3526
		if (slot == 0) {
3527 3528 3529
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
3530
			wret = fixup_low_keys(trans, root, path,
3531
					      &disk_key, 1);
C
Chris Mason 已提交
3532 3533 3534 3535
			if (wret)
				ret = wret;
		}

C
Chris Mason 已提交
3536
		/* delete the leaf if it is mostly empty */
3537
		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3538 3539 3540 3541
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
3542
			slot = path->slots[1];
3543 3544
			extent_buffer_get(leaf);

3545
			wret = push_leaf_left(trans, root, path, 1, 1);
3546
			if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3547
				ret = wret;
3548 3549 3550

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
3551
				wret = push_leaf_right(trans, root, path, 1, 1);
3552
				if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3553 3554
					ret = wret;
			}
3555 3556

			if (btrfs_header_nritems(leaf) == 0) {
3557
				path->slots[1] = slot;
C
Chris Mason 已提交
3558 3559
				ret = btrfs_del_leaf(trans, root, path,
						     leaf->start);
3560
				BUG_ON(ret);
3561
				free_extent_buffer(leaf);
C
Chris Mason 已提交
3562
			} else {
3563 3564 3565 3566 3567 3568 3569
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
3570
				free_extent_buffer(leaf);
3571
			}
3572
		} else {
3573
			btrfs_mark_buffer_dirty(leaf);
3574 3575
		}
	}
C
Chris Mason 已提交
3576
	return ret;
3577 3578
}

3579
/*
3580
 * search the tree again to find a leaf with lesser keys
3581 3582
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3583 3584 3585
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
3586 3587 3588
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
3589 3590 3591
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;
3592

3593
	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3594

3595 3596 3597 3598 3599 3600 3601 3602
	if (key.offset > 0)
		key.offset--;
	else if (key.type > 0)
		key.type--;
	else if (key.objectid > 0)
		key.objectid--;
	else
		return 1;
3603

3604 3605 3606 3607 3608 3609 3610 3611 3612
	btrfs_release_path(root, path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	if (ret < 0)
		return 0;
	return 1;
3613 3614
}

3615 3616 3617
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are either in cache or have a minimum
C
Chris Mason 已提交
3618
 * transaction id.  This is used by the btree defrag code, and tree logging
3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
C
Chris Mason 已提交
3630 3631 3632 3633
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
3634 3635 3636 3637
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3638
			 struct btrfs_key *max_key,
3639 3640 3641 3642 3643 3644
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
3645
	int sret;
3646 3647 3648 3649
	u32 nritems;
	int level;
	int ret = 1;

3650
	WARN_ON(!path->keep_locks);
3651 3652 3653
again:
	cur = btrfs_lock_root_node(root);
	level = btrfs_header_level(cur);
3654
	WARN_ON(path->nodes[level]);
3655 3656 3657 3658 3659 3660 3661
	path->nodes[level] = cur;
	path->locks[level] = 1;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
C
Chris Mason 已提交
3662
	while (1) {
3663 3664
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
3665
		sret = bin_search(cur, min_key, level, &slot);
3666

3667 3668
		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
3669 3670
			if (slot >= nritems)
				goto find_next_key;
3671 3672 3673 3674 3675
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
3676 3677
		if (sret && slot > 0)
			slot--;
3678 3679 3680 3681 3682
		/*
		 * check this node pointer against the cache_only and
		 * min_trans parameters.  If it isn't in cache or is too
		 * old, skip to the next one.
		 */
C
Chris Mason 已提交
3683
		while (slot < nritems) {
3684 3685 3686
			u64 blockptr;
			u64 gen;
			struct extent_buffer *tmp;
3687 3688
			struct btrfs_disk_key disk_key;

3689 3690 3691 3692 3693 3694 3695 3696 3697
			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			if (!cache_only)
				break;

3698 3699 3700 3701 3702 3703 3704 3705
			if (max_key) {
				btrfs_node_key(cur, &disk_key, slot);
				if (comp_keys(&disk_key, max_key) >= 0) {
					ret = 1;
					goto out;
				}
			}

3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716
			tmp = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));

			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				free_extent_buffer(tmp);
				break;
			}
			if (tmp)
				free_extent_buffer(tmp);
			slot++;
		}
3717
find_next_key:
3718 3719 3720 3721 3722
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
3723 3724
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
3725
						  cache_only, min_trans);
3726
			if (sret == 0) {
3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765
				btrfs_release_path(root, path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1);
			goto out;
		}
		cur = read_node_slot(root, cur, slot);

		btrfs_tree_lock(cur);
		path->locks[level - 1] = 1;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1);
	}
out:
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the cache_only and min_trans
 * parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
3766
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3767 3768
			struct btrfs_key *key, int lowest_level,
			int cache_only, u64 min_trans)
3769 3770 3771 3772 3773
{
	int level = lowest_level;
	int slot;
	struct extent_buffer *c;

3774
	WARN_ON(!path->keep_locks);
C
Chris Mason 已提交
3775
	while (level < BTRFS_MAX_LEVEL) {
3776 3777 3778 3779 3780
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
3781
next:
3782 3783
		if (slot >= btrfs_header_nritems(c)) {
			level++;
C
Chris Mason 已提交
3784
			if (level == BTRFS_MAX_LEVEL)
3785 3786 3787 3788 3789
				return 1;
			continue;
		}
		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809
		else {
			u64 blockptr = btrfs_node_blockptr(c, slot);
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (cache_only) {
				struct extent_buffer *cur;
				cur = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));
				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
					slot++;
					if (cur)
						free_extent_buffer(cur);
					goto next;
				}
				free_extent_buffer(cur);
			}
			if (gen < min_trans) {
				slot++;
				goto next;
			}
3810
			btrfs_node_key_to_cpu(c, key, slot);
3811
		}
3812 3813 3814 3815 3816
		return 0;
	}
	return 1;
}

C
Chris Mason 已提交
3817
/*
3818
 * search the tree again to find a leaf with greater keys
C
Chris Mason 已提交
3819 3820
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3821
 */
C
Chris Mason 已提交
3822
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3823 3824 3825
{
	int slot;
	int level = 1;
3826 3827
	struct extent_buffer *c;
	struct extent_buffer *next = NULL;
3828 3829 3830 3831 3832
	struct btrfs_key key;
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
C
Chris Mason 已提交
3833
	if (nritems == 0)
3834 3835 3836 3837 3838
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);

	btrfs_release_path(root, path);
3839
	path->keep_locks = 1;
3840 3841 3842 3843 3844 3845
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

3846
	nritems = btrfs_header_nritems(path->nodes[0]);
3847 3848 3849 3850 3851 3852
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
3853
	if (nritems > 0 && path->slots[0] < nritems - 1) {
3854
		path->slots[0]++;
3855 3856
		goto done;
	}
3857

C
Chris Mason 已提交
3858
	while (level < BTRFS_MAX_LEVEL) {
3859
		if (!path->nodes[level])
C
Chris Mason 已提交
3860
			return 1;
3861

3862 3863
		slot = path->slots[level] + 1;
		c = path->nodes[level];
3864
		if (slot >= btrfs_header_nritems(c)) {
3865
			level++;
C
Chris Mason 已提交
3866
			if (level == BTRFS_MAX_LEVEL)
3867
				return 1;
3868 3869
			continue;
		}
3870

3871 3872
		if (next) {
			btrfs_tree_unlock(next);
3873
			free_extent_buffer(next);
3874
		}
3875

3876 3877
		if (level == 1 && (path->locks[1] || path->skip_locking) &&
		    path->reada)
3878
			reada_for_search(root, path, level, slot, 0);
3879

3880
		next = read_node_slot(root, c, slot);
3881 3882 3883 3884
		if (!path->skip_locking) {
			WARN_ON(!btrfs_tree_locked(c));
			btrfs_tree_lock(next);
		}
3885 3886 3887
		break;
	}
	path->slots[level] = slot;
C
Chris Mason 已提交
3888
	while (1) {
3889 3890
		level--;
		c = path->nodes[level];
3891 3892
		if (path->locks[level])
			btrfs_tree_unlock(c);
3893
		free_extent_buffer(c);
3894 3895
		path->nodes[level] = next;
		path->slots[level] = 0;
3896 3897
		if (!path->skip_locking)
			path->locks[level] = 1;
3898 3899
		if (!level)
			break;
3900 3901
		if (level == 1 && path->locks[1] && path->reada)
			reada_for_search(root, path, level, slot, 0);
3902
		next = read_node_slot(root, next, 0);
3903 3904 3905 3906
		if (!path->skip_locking) {
			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
			btrfs_tree_lock(next);
		}
3907
	}
3908 3909
done:
	unlock_up(path, 0, 1);
3910 3911
	return 0;
}
3912

3913 3914 3915 3916 3917 3918
/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
3919 3920 3921 3922 3923 3924
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
3925
	u32 nritems;
3926 3927
	int ret;

C
Chris Mason 已提交
3928
	while (1) {
3929 3930 3931 3932 3933 3934 3935 3936
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
3937 3938 3939 3940 3941 3942
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

3943 3944 3945
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.type == type)
			return 0;
3946 3947 3948 3949 3950
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
3951 3952 3953
	}
	return 1;
}