inode-map.c 14.7 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

19 20 21 22
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/pagemap.h>

23 24
#include "ctree.h"
#include "disk-io.h"
25 26
#include "free-space-cache.h"
#include "inode-map.h"
27 28
#include "transaction.h"

29 30 31 32 33 34 35 36 37 38 39 40
static int caching_kthread(void *data)
{
	struct btrfs_root *root = data;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	u64 last = (u64)-1;
	int slot;
	int ret;

41
	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
C
Chris Mason 已提交
42 43
		return 0;

44 45 46 47 48 49 50
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Since the commit root is read-only, we can safely skip locking. */
	path->skip_locking = 1;
	path->search_commit_root = 1;
51
	path->reada = READA_FORWARD;
52 53 54 55 56 57

	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_INODE_ITEM_KEY;
again:
	/* need to make sure the commit_root doesn't disappear */
58
	down_read(&fs_info->commit_root_sem);
59 60 61 62 63 64

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
65
		if (btrfs_fs_closing(fs_info))
66 67 68 69
			goto out;

		leaf = path->nodes[0];
		slot = path->slots[0];
70
		if (slot >= btrfs_header_nritems(leaf)) {
71 72 73 74 75 76 77 78 79 80
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;

			if (need_resched() ||
			    btrfs_transaction_in_commit(fs_info)) {
				leaf = path->nodes[0];

81
				if (WARN_ON(btrfs_header_nritems(leaf) == 0))
82 83 84 85 86 87 88
					break;

				/*
				 * Save the key so we can advances forward
				 * in the next search.
				 */
				btrfs_item_key_to_cpu(leaf, &key, 0);
89
				btrfs_release_path(path);
90
				root->ino_cache_progress = last;
91
				up_read(&fs_info->commit_root_sem);
92 93 94 95 96 97 98 99 100 101 102
				schedule_timeout(1);
				goto again;
			} else
				continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);

		if (key.type != BTRFS_INODE_ITEM_KEY)
			goto next;

103
		if (key.objectid >= root->highest_objectid)
104 105 106
			break;

		if (last != (u64)-1 && last + 1 != key.objectid) {
107
			__btrfs_add_free_space(fs_info, ctl, last + 1,
108
					       key.objectid - last - 1);
109
			wake_up(&root->ino_cache_wait);
110 111 112 113 114 115 116
		}

		last = key.objectid;
next:
		path->slots[0]++;
	}

117
	if (last < root->highest_objectid - 1) {
118
		__btrfs_add_free_space(fs_info, ctl, last + 1,
119
				       root->highest_objectid - last - 1);
120 121
	}

122 123 124
	spin_lock(&root->ino_cache_lock);
	root->ino_cache_state = BTRFS_CACHE_FINISHED;
	spin_unlock(&root->ino_cache_lock);
125

126
	root->ino_cache_progress = (u64)-1;
127 128
	btrfs_unpin_free_ino(root);
out:
129
	wake_up(&root->ino_cache_wait);
130
	up_read(&fs_info->commit_root_sem);
131 132 133 134 135 136 137 138

	btrfs_free_path(path);

	return ret;
}

static void start_caching(struct btrfs_root *root)
{
139
	struct btrfs_fs_info *fs_info = root->fs_info;
140
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
141
	struct task_struct *tsk;
142
	int ret;
143
	u64 objectid;
144

145
	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
C
Chris Mason 已提交
146 147
		return;

148 149 150
	spin_lock(&root->ino_cache_lock);
	if (root->ino_cache_state != BTRFS_CACHE_NO) {
		spin_unlock(&root->ino_cache_lock);
151 152 153
		return;
	}

154 155
	root->ino_cache_state = BTRFS_CACHE_STARTED;
	spin_unlock(&root->ino_cache_lock);
156

157
	ret = load_free_ino_cache(fs_info, root);
158
	if (ret == 1) {
159 160 161
		spin_lock(&root->ino_cache_lock);
		root->ino_cache_state = BTRFS_CACHE_FINISHED;
		spin_unlock(&root->ino_cache_lock);
162 163 164
		return;
	}

165 166 167 168 169 170 171 172 173
	/*
	 * It can be quite time-consuming to fill the cache by searching
	 * through the extent tree, and this can keep ino allocation path
	 * waiting. Therefore at start we quickly find out the highest
	 * inode number and we know we can use inode numbers which fall in
	 * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
	 */
	ret = btrfs_find_free_objectid(root, &objectid);
	if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
174
		__btrfs_add_free_space(fs_info, ctl, objectid,
175 176 177
				       BTRFS_LAST_FREE_OBJECTID - objectid + 1);
	}

178
	tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
179
			  root->root_key.objectid);
180
	if (IS_ERR(tsk)) {
181 182
		btrfs_warn(fs_info, "failed to start inode caching task");
		btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
183
					     "disabling inode map caching");
184
	}
185 186 187 188
}

/*
 * Allocate a free inode number for @root into *@objectid.
 *
 * With the inode map cache disabled this degenerates to
 * btrfs_find_free_objectid().  Otherwise it takes a number from the
 * free_ino cache, starting the caching worker and sleeping on
 * ino_cache_wait until either free space appears or caching finishes.
 * Returns 0 on success, -ENOSPC when every inode number is in use.
 */
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
{
	if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
		return btrfs_find_free_objectid(root, objectid);

again:
	*objectid = btrfs_find_ino_for_alloc(root);

	if (*objectid != 0)
		return 0;

	start_caching(root);

	wait_event(root->ino_cache_wait,
		   root->ino_cache_state == BTRFS_CACHE_FINISHED ||
		   root->free_ino_ctl->free_space > 0);

	/* Caching completed and found nothing free: truly out of inos. */
	if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
	    root->free_ino_ctl->free_space == 0)
		return -ENOSPC;
	else
		goto again;
}

/*
 * Return inode number @objectid to @root's cache (e.g. after a failed
 * inode creation or an unlink).
 *
 * The number is added to the pinned tree rather than the free tree
 * directly; btrfs_unpin_free_ino() migrates it at commit time.  While
 * caching is still running we must hold commit_root_sem for writing so the
 * caching thread cannot race past us, and we re-check the state under
 * ino_cache_lock in case caching finished in between.
 */
void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;

	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
		return;
again:
	if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
		__btrfs_add_free_space(fs_info, pinned, objectid, 1);
	} else {
		down_write(&fs_info->commit_root_sem);
		spin_lock(&root->ino_cache_lock);
		if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
			/* Caching finished while we took the locks; retry. */
			spin_unlock(&root->ino_cache_lock);
			up_write(&fs_info->commit_root_sem);
			goto again;
		}
		spin_unlock(&root->ino_cache_lock);

		start_caching(root);

		__btrfs_add_free_space(fs_info, pinned, objectid, 1);

		up_write(&fs_info->commit_root_sem);
	}
}

/*
240 241 242 243
 * When a transaction is committed, we'll move those inode numbers which are
 * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
 * others will just be dropped, because the commit root we were searching has
 * changed.
244
 *
245
 * Must be called with root->fs_info->commit_root_sem held
246 247 248 249 250
 */
void btrfs_unpin_free_ino(struct btrfs_root *root)
{
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
251
	spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
252 253 254 255
	struct btrfs_free_space *info;
	struct rb_node *n;
	u64 count;

256
	if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
C
Chris Mason 已提交
257 258
		return;

259
	while (1) {
260 261 262
		bool add_to_ctl = true;

		spin_lock(rbroot_lock);
263
		n = rb_first(rbroot);
264 265
		if (!n) {
			spin_unlock(rbroot_lock);
266
			break;
267
		}
268 269

		info = rb_entry(n, struct btrfs_free_space, offset_index);
270
		BUG_ON(info->bitmap); /* Logic error */
271

272
		if (info->offset > root->ino_cache_progress)
273
			add_to_ctl = false;
274 275
		else if (info->offset + info->bytes > root->ino_cache_progress)
			count = root->ino_cache_progress - info->offset + 1;
276 277 278 279
		else
			count = info->bytes;

		rb_erase(&info->offset_index, rbroot);
280 281
		spin_unlock(rbroot_lock);
		if (add_to_ctl)
282 283
			__btrfs_add_free_space(root->fs_info, ctl,
					       info->offset, count);
284
		kmem_cache_free(btrfs_free_space_cachep, info);
285 286 287
	}
}

288
#define INIT_THRESHOLD	((SZ_32K / 2) / sizeof(struct btrfs_free_space))
289
#define INODES_PER_BITMAP (PAGE_SIZE * 8)
290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322

/*
 * The goal is to keep the memory used by the free_ino tree won't
 * exceed the memory if we use bitmaps only.
 */
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int max_ino;
	int max_bitmaps;

	n = rb_last(&ctl->free_space_offset);
	if (!n) {
		ctl->extents_thresh = INIT_THRESHOLD;
		return;
	}
	info = rb_entry(n, struct btrfs_free_space, offset_index);

	/*
	 * Find the maximum inode number in the filesystem. Note we
	 * ignore the fact that this can be a bitmap, because we are
	 * not doing precise calculation.
	 */
	max_ino = info->bytes - 1;

	max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
	if (max_bitmaps <= ctl->total_bitmaps) {
		ctl->extents_thresh = 0;
		return;
	}

	ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
323
				PAGE_SIZE / sizeof(*info);
324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
}

/*
 * Decide whether a new chunk of free inode numbers should be stored as a
 * bitmap.  Stay with extents while we are under the extents threshold, and
 * never convert a large chunk (more than a tenth of a bitmap's capacity)
 * since extents represent big runs far more compactly.
 */
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
		       struct btrfs_free_space *info)
{
	return ctl->free_extents >= ctl->extents_thresh &&
	       info->bytes <= INODES_PER_BITMAP / 10;
}

340
static const struct btrfs_free_space_op free_ino_op = {
341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
	.recalc_thresholds	= recalculate_thresholds,
	.use_bitmap		= use_bitmap,
};

/* Intentionally empty: the pinned tree never adjusts its thresholds. */
static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
{
}

static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	/*
	 * We always use extents (never bitmaps) for two reasons:
	 *
	 * - The pinned tree is only used during the process of caching
	 *   work.
	 * - Make code simpler. See btrfs_unpin_free_ino(), which relies on
	 *   every pinned entry being a plain extent.
	 */
	return false;
}

362
static const struct btrfs_free_space_op pinned_free_ino_op = {
363 364 365 366 367 368 369 370 371 372 373 374 375 376
	.recalc_thresholds	= pinned_recalc_thresholds,
	.use_bitmap		= pinned_use_bitmap,
};

void btrfs_init_free_ino_ctl(struct btrfs_root *root)
{
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;

	spin_lock_init(&ctl->tree_lock);
	ctl->unit = 1;
	ctl->start = 0;
	ctl->private = NULL;
	ctl->op = &free_ino_op;
377 378
	INIT_LIST_HEAD(&ctl->trimming_ranges);
	mutex_init(&ctl->cache_writeout_mutex);
379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394

	/*
	 * Initially we allow to use 16K of ram to cache chunks of
	 * inode numbers before we resort to bitmaps. This is somewhat
	 * arbitrary, but it will be adjusted in runtime.
	 */
	ctl->extents_thresh = INIT_THRESHOLD;

	spin_lock_init(&pinned->tree_lock);
	pinned->unit = 1;
	pinned->start = 0;
	pinned->private = NULL;
	pinned->extents_thresh = 0;
	pinned->op = &pinned_free_ino_op;
}

395 396 397
int btrfs_save_ino_cache(struct btrfs_root *root,
			 struct btrfs_trans_handle *trans)
{
398
	struct btrfs_fs_info *fs_info = root->fs_info;
399 400 401
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_path *path;
	struct inode *inode;
402
	struct btrfs_block_rsv *rsv;
403
	struct extent_changeset *data_reserved = NULL;
404
	u64 num_bytes;
405 406 407 408 409
	u64 alloc_hint = 0;
	int ret;
	int prealloc;
	bool retry = false;

410 411 412 413 414 415
	/* only fs tree and subvol/snap needs ino cache */
	if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
	    (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
	     root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
		return 0;

416
	/* Don't save inode cache if we are deleting this root */
417
	if (btrfs_root_refs(&root->root_item) == 0)
418 419
		return 0;

420
	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
C
Chris Mason 已提交
421 422
		return 0;

423 424 425
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
C
Chris Mason 已提交
426

427
	rsv = trans->block_rsv;
428
	trans->block_rsv = &fs_info->trans_block_rsv;
429 430 431 432

	num_bytes = trans->bytes_reserved;
	/*
	 * 1 item for inode item insertion if need
433 434
	 * 4 items for inode item update (in the worst case)
	 * 1 items for slack space if we need do truncation
435 436 437
	 * 1 item for free space object
	 * 3 items for pre-allocation
	 */
438
	trans->bytes_reserved = btrfs_calc_trans_metadata_size(fs_info, 10);
M
Miao Xie 已提交
439 440 441
	ret = btrfs_block_rsv_add(root, trans->block_rsv,
				  trans->bytes_reserved,
				  BTRFS_RESERVE_NO_FLUSH);
442 443
	if (ret)
		goto out;
444 445
	trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
				      trans->bytes_reserved, 1);
446 447
again:
	inode = lookup_free_ino_inode(root, path);
448
	if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
449
		ret = PTR_ERR(inode);
450
		goto out_release;
451 452 453
	}

	if (IS_ERR(inode)) {
454
		BUG_ON(retry); /* Logic error */
455 456 457 458
		retry = true;

		ret = create_free_ino_inode(root, trans, path);
		if (ret)
459
			goto out_release;
460 461 462 463 464
		goto again;
	}

	BTRFS_I(inode)->generation = 0;
	ret = btrfs_update_inode(trans, root, inode);
465
	if (ret) {
466
		btrfs_abort_transaction(trans, ret);
467 468
		goto out_put;
	}
469 470

	if (i_size_read(inode) > 0) {
471
		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
472
		if (ret) {
473
			if (ret != -ENOSPC)
474
				btrfs_abort_transaction(trans, ret);
475
			goto out_put;
476
		}
477 478
	}

479 480
	spin_lock(&root->ino_cache_lock);
	if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
481
		ret = -1;
482
		spin_unlock(&root->ino_cache_lock);
483 484
		goto out_put;
	}
485
	spin_unlock(&root->ino_cache_lock);
486 487 488

	spin_lock(&ctl->tree_lock);
	prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
489 490
	prealloc = ALIGN(prealloc, PAGE_SIZE);
	prealloc += ctl->total_bitmaps * PAGE_SIZE;
491 492 493
	spin_unlock(&ctl->tree_lock);

	/* Just to make sure we have enough space */
494
	prealloc += 8 * PAGE_SIZE;
495

496
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
497 498 499 500 501
	if (ret)
		goto out_put;

	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
					      prealloc, prealloc, &alloc_hint);
502
	if (ret) {
503
		btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
504
		goto out_put;
505
	}
506

507
	ret = btrfs_write_out_ino_cache(root, trans, path, inode);
508
	btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
509 510
out_put:
	iput(inode);
511
out_release:
512 513
	trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
				      trans->bytes_reserved, 0);
514 515
	btrfs_block_rsv_release(fs_info, trans->block_rsv,
				trans->bytes_reserved);
516
out:
517 518
	trans->block_rsv = rsv;
	trans->bytes_reserved = num_bytes;
519 520

	btrfs_free_path(path);
521
	extent_changeset_free(data_reserved);
522 523 524
	return ret;
}

525
int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
C
Chris Mason 已提交
526 527 528
{
	struct btrfs_path *path;
	int ret;
529
	struct extent_buffer *l;
C
Chris Mason 已提交
530
	struct btrfs_key search_key;
531
	struct btrfs_key found_key;
C
Chris Mason 已提交
532 533 534
	int slot;

	path = btrfs_alloc_path();
T
Tsutomu Itoh 已提交
535 536
	if (!path)
		return -ENOMEM;
C
Chris Mason 已提交
537

538 539
	search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
	search_key.type = -1;
C
Chris Mason 已提交
540 541 542 543
	search_key.offset = (u64)-1;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto error;
544
	BUG_ON(ret == 0); /* Corruption */
C
Chris Mason 已提交
545 546
	if (path->slots[0] > 0) {
		slot = path->slots[0] - 1;
547 548
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);
549 550
		*objectid = max_t(u64, found_key.objectid,
				  BTRFS_FIRST_FREE_OBJECTID - 1);
C
Chris Mason 已提交
551
	} else {
552
		*objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
C
Chris Mason 已提交
553 554 555 556 557 558 559
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

560
int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
561 562
{
	int ret;
563
	mutex_lock(&root->objectid_mutex);
564

565
	if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
566 567 568
		btrfs_warn(root->fs_info,
			   "the objectid of root %llu reaches its highest value",
			   root->root_key.objectid);
569 570
		ret = -ENOSPC;
		goto out;
571
	}
572 573 574 575

	*objectid = ++root->highest_objectid;
	ret = 0;
out:
576
	mutex_unlock(&root->objectid_mutex);
577 578
	return ret;
}