// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/kthread.h>
#include <linux/pagemap.h>

#include "ctree.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "transaction.h"
#include "delalloc-space.h"

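/*
 * Disable inode number caching for the whole filesystem when the caching
 * kthread cannot be started: clear the pending mount option, mark this
 * root's cache as errored and wake any waiters so they fall back to
 * btrfs_find_free_objectid().
 */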
static void fail_caching_thread(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	btrfs_warn(fs_info, "failed to start inode caching task");
	btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
				     "disabling inode map caching");
	spin_lock(&root->ino_cache_lock);
	root->ino_cache_state = BTRFS_CACHE_ERROR;
	spin_unlock(&root->ino_cache_lock);
	wake_up(&root->ino_cache_wait);
}

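/*
 * Worker for the inode number caching kthread: walk the commit root of
 * this subvolume looking at inode items, and feed every gap between
 * allocated objectids into the free_ino cache.
 */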
static int caching_kthread(void *data)
{
	struct btrfs_root *root = data;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	u64 last = (u64)-1;
	int slot;
	int ret;

	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
		return 0;

	path = btrfs_alloc_path();
	if (!path) {
		fail_caching_thread(root);
		return -ENOMEM;
	}

	/* Since the commit root is read-only, we can safely skip locking. */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_INODE_ITEM_KEY;
again:
	/* need to make sure the commit_root doesn't disappear */
	down_read(&fs_info->commit_root_sem);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		if (btrfs_fs_closing(fs_info))
			goto out;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;

			if (need_resched() ||
			    btrfs_transaction_in_commit(fs_info)) {
				leaf = path->nodes[0];

				if (WARN_ON(btrfs_header_nritems(leaf) == 0))
					break;

				/*
				 * Save the key so we can advance forward
				 * in the next search.
				 */
				btrfs_item_key_to_cpu(leaf, &key, 0);
				btrfs_release_path(path);
				root->ino_cache_progress = last;
				up_read(&fs_info->commit_root_sem);
				schedule_timeout(1);
				goto again;
			} else
				continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);

		if (key.type != BTRFS_INODE_ITEM_KEY)
			goto next;

		if (key.objectid >= root->highest_objectid)
			break;

		if (last != (u64)-1 && last + 1 != key.objectid) {
			__btrfs_add_free_space(fs_info, ctl, last + 1,
					       key.objectid - last - 1, 0);
			wake_up(&root->ino_cache_wait);
		}

		last = key.objectid;
next:
		path->slots[0]++;
	}

	if (last < root->highest_objectid - 1) {
		__btrfs_add_free_space(fs_info, ctl, last + 1,
				       root->highest_objectid - last - 1, 0);
	}

	spin_lock(&root->ino_cache_lock);
	root->ino_cache_state = BTRFS_CACHE_FINISHED;
	spin_unlock(&root->ino_cache_lock);

	root->ino_cache_progress = (u64)-1;
	btrfs_unpin_free_ino(root);
out:
	wake_up(&root->ino_cache_wait);
	up_read(&fs_info->commit_root_sem);

	btrfs_free_path(path);

	return ret;
}

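/*
 * Move the cache from BTRFS_CACHE_NO to BTRFS_CACHE_STARTED: try to load
 * the on-disk free ino cache first; failing that, publish the range above
 * the current highest objectid right away and start the caching kthread
 * to find the holes below it.
 */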
static void start_caching(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct task_struct *tsk;
	int ret;
	u64 objectid;

	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
		return;

	spin_lock(&root->ino_cache_lock);
	if (root->ino_cache_state != BTRFS_CACHE_NO) {
		spin_unlock(&root->ino_cache_lock);
		return;
	}

	root->ino_cache_state = BTRFS_CACHE_STARTED;
	spin_unlock(&root->ino_cache_lock);

	ret = load_free_ino_cache(fs_info, root);
	if (ret == 1) {
		spin_lock(&root->ino_cache_lock);
		root->ino_cache_state = BTRFS_CACHE_FINISHED;
		spin_unlock(&root->ino_cache_lock);
		wake_up(&root->ino_cache_wait);
		return;
	}

	/*
	 * It can be quite time-consuming to fill the cache by searching
	 * through the subvolume tree, and this can keep the ino allocation
	 * path waiting. Therefore at the start we quickly find out the highest
	 * inode number and we know we can use inode numbers which fall in
	 * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
	 */
	ret = btrfs_find_free_objectid(root, &objectid);
	if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
		__btrfs_add_free_space(fs_info, ctl, objectid,
				       BTRFS_LAST_FREE_OBJECTID - objectid + 1,
				       0);
		wake_up(&root->ino_cache_wait);
	}

	tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
			  root->root_key.objectid);
	if (IS_ERR(tsk))
		fail_caching_thread(root);
}

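/*
 * Allocate a free inode number, preferring the in-memory cache and
 * falling back to btrfs_find_free_objectid() when caching is disabled
 * or has hit an error.
 */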
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
{
	if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
		return btrfs_find_free_objectid(root, objectid);

again:
	*objectid = btrfs_find_ino_for_alloc(root);

	if (*objectid != 0)
		return 0;

	start_caching(root);

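	/*
	 * Wait until the scan has finished, the caching thread has failed,
	 * or at least one free inode number has been cached.
	 */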
	wait_event(root->ino_cache_wait,
		   root->ino_cache_state == BTRFS_CACHE_FINISHED ||
		   root->ino_cache_state == BTRFS_CACHE_ERROR ||
		   root->free_ino_ctl->free_space > 0);

	if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
	    root->free_ino_ctl->free_space == 0)
		return -ENOSPC;
	else if (root->ino_cache_state == BTRFS_CACHE_ERROR)
		return btrfs_find_free_objectid(root, objectid);
	else
		goto again;
}

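/*
 * Hand an unused inode number back to the cache. The number always goes
 * into the pinned tree first and is only released to the free_ino tree
 * at transaction commit time, see btrfs_unpin_free_ino().
 */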
void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;

	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
		return;
again:
	if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
		__btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
	} else {
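		/*
		 * Hold commit_root_sem so the caching thread cannot race to
		 * completion, then recheck the cache state under the lock.
		 */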
		down_write(&fs_info->commit_root_sem);
		spin_lock(&root->ino_cache_lock);
		if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
			spin_unlock(&root->ino_cache_lock);
			up_write(&fs_info->commit_root_sem);
			goto again;
		}
		spin_unlock(&root->ino_cache_lock);

		start_caching(root);

		__btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);

		up_write(&fs_info->commit_root_sem);
	}
}

/*
 * When a transaction is committed, we'll move those inode numbers which are
 * smaller than root->ino_cache_progress from the pinned tree to the free_ino
 * tree, and the others will just be dropped, because the commit root we were
 * searching has changed.
 *
 * Must be called with root->fs_info->commit_root_sem held.
 */
void btrfs_unpin_free_ino(struct btrfs_root *root)
{
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
	spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
	struct btrfs_free_space *info;
	struct rb_node *n;
	u64 count;

	if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
		return;

	while (1) {
		spin_lock(rbroot_lock);
		n = rb_first(rbroot);
		if (!n) {
			spin_unlock(rbroot_lock);
			break;
		}

		info = rb_entry(n, struct btrfs_free_space, offset_index);
		BUG_ON(info->bitmap); /* Logic error */

		if (info->offset > root->ino_cache_progress)
			count = 0;
		else
			count = min(root->ino_cache_progress - info->offset + 1,
				    info->bytes);

		rb_erase(&info->offset_index, rbroot);
		spin_unlock(rbroot_lock);
		if (count)
			__btrfs_add_free_space(root->fs_info, ctl,
					       info->offset, count, 0);
		kmem_cache_free(btrfs_free_space_cachep, info);
	}
}

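/*
 * A sketch of the sizing arithmetic (assuming 4K pages and a
 * struct btrfs_free_space of roughly 48 bytes): INIT_THRESHOLD allows about
 * (32768 / 2) / 48 = 341 extent entries before bitmaps are considered, and
 * one bitmap page can track PAGE_SIZE * 8 = 32768 inode numbers.
 */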
#define INIT_THRESHOLD	((SZ_32K / 2) / sizeof(struct btrfs_free_space))
#define INODES_PER_BITMAP (PAGE_SIZE * 8)

/*
 * The goal is to keep the memory used by the free_ino tree from exceeding
 * the memory we would use if we tracked free inode numbers with bitmaps only.
 */
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int max_ino;
	int max_bitmaps;

	n = rb_last(&ctl->free_space_offset);
	if (!n) {
		ctl->extents_thresh = INIT_THRESHOLD;
		return;
	}
	info = rb_entry(n, struct btrfs_free_space, offset_index);

	/*
	 * Find the maximum inode number in the filesystem. Note we
	 * ignore the fact that this can be a bitmap, because we are
	 * not doing precise calculation.
	 */
	max_ino = info->bytes - 1;

	max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
	if (max_bitmaps <= ctl->total_bitmaps) {
		ctl->extents_thresh = 0;
		return;
	}

	ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
				PAGE_SIZE / sizeof(*info);
}

/*
 * We don't fall back to a bitmap if we are below the extents threshold
 * or if this chunk of inode numbers is a big one.
 */
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
		       struct btrfs_free_space *info)
{
	if (ctl->free_extents < ctl->extents_thresh ||
	    info->bytes > INODES_PER_BITMAP / 10)
		return false;

	return true;
}

static const struct btrfs_free_space_op free_ino_op = {
	.recalc_thresholds	= recalculate_thresholds,
	.use_bitmap		= use_bitmap,
};

static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
{
}

static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	/*
	 * We always use extents for two reasons:
	 *
	 * - The pinned tree is only used while the caching work is in
	 *   progress.
	 * - It keeps the code simpler; see btrfs_unpin_free_ino().
	 */
	return false;
}

static const struct btrfs_free_space_op pinned_free_ino_op = {
	.recalc_thresholds	= pinned_recalc_thresholds,
	.use_bitmap		= pinned_use_bitmap,
};

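/*
 * Set up the two free-space controls backing the inode number cache: the
 * free_ino tree that allocations draw from, and the pinned tree holding
 * returned numbers until the transaction commits. Both operate in units
 * of single inode numbers (unit = 1).
 */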
void btrfs_init_free_ino_ctl(struct btrfs_root *root)
{
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;

	spin_lock_init(&ctl->tree_lock);
	ctl->unit = 1;
	ctl->start = 0;
	ctl->private = NULL;
	ctl->op = &free_ino_op;
	INIT_LIST_HEAD(&ctl->trimming_ranges);
	mutex_init(&ctl->cache_writeout_mutex);

	/*
	 * Initially we allow the use of 16K of RAM to cache chunks of
	 * inode numbers before we resort to bitmaps. This is somewhat
	 * arbitrary, but it will be adjusted at runtime.
	 */
	ctl->extents_thresh = INIT_THRESHOLD;

	spin_lock_init(&pinned->tree_lock);
	pinned->unit = 1;
	pinned->start = 0;
	pinned->private = NULL;
	pinned->extents_thresh = 0;
	pinned->op = &pinned_free_ino_op;
}

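/*
 * Write the in-memory free_ino cache into its cache inode at transaction
 * commit time, reserving metadata and preallocating data space up front
 * so the writeout itself does not need to allocate space mid-commit.
 */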
int btrfs_save_ino_cache(struct btrfs_root *root,
			 struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_path *path;
	struct inode *inode;
	struct btrfs_block_rsv *rsv;
	struct extent_changeset *data_reserved = NULL;
	u64 num_bytes;
	u64 alloc_hint = 0;
	int ret;
	int prealloc;
	bool retry = false;

	/* Only the fs tree and subvol/snap trees need the ino cache */
	if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
	    (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
	     root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
		return 0;

	/* Don't save inode cache if we are deleting this root */
	if (btrfs_root_refs(&root->root_item) == 0)
		return 0;

	if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	rsv = trans->block_rsv;
	trans->block_rsv = &fs_info->trans_block_rsv;

	num_bytes = trans->bytes_reserved;
	/*
	 * 1 item for the inode item insertion, if needed
	 * 4 items for the inode item update (in the worst case)
	 * 1 item for slack space, if we need to do truncation
	 * 1 item for the free space object
	 * 3 items for pre-allocation
	 */
	trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10);
	ret = btrfs_block_rsv_add(root, trans->block_rsv,
				  trans->bytes_reserved,
				  BTRFS_RESERVE_NO_FLUSH);
	if (ret)
		goto out;
	trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
				      trans->bytes_reserved, 1);
again:
	inode = lookup_free_ino_inode(root, path);
	if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
		ret = PTR_ERR(inode);
		goto out_release;
	}

	if (IS_ERR(inode)) {
		BUG_ON(retry); /* Logic error */
		retry = true;

		ret = create_free_ino_inode(root, trans, path);
		if (ret)
			goto out_release;
		goto again;
	}

	BTRFS_I(inode)->generation = 0;
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_put;
	}

	if (i_size_read(inode) > 0) {
		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
		if (ret) {
			if (ret != -ENOSPC)
				btrfs_abort_transaction(trans, ret);
			goto out_put;
		}
	}

	spin_lock(&root->ino_cache_lock);
	if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
		ret = -1;
		spin_unlock(&root->ino_cache_lock);
		goto out_put;
	}
	spin_unlock(&root->ino_cache_lock);

	spin_lock(&ctl->tree_lock);
	prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
	prealloc = ALIGN(prealloc, PAGE_SIZE);
	prealloc += ctl->total_bitmaps * PAGE_SIZE;
	spin_unlock(&ctl->tree_lock);

	/* Just to make sure we have enough space */
	prealloc += 8 * PAGE_SIZE;

	ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0,
					   prealloc);
	if (ret)
		goto out_put;

	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
					      prealloc, prealloc, &alloc_hint);
	if (ret) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true);
		goto out_put;
	}

	ret = btrfs_write_out_ino_cache(root, trans, path, inode);
	btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
out_put:
	iput(inode);
out_release:
	trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
				      trans->bytes_reserved, 0);
	btrfs_block_rsv_release(fs_info, trans->block_rsv,
				trans->bytes_reserved, NULL);
out:
	trans->block_rsv = rsv;
	trans->bytes_reserved = num_bytes;

	btrfs_free_path(path);
	extent_changeset_free(data_reserved);
	return ret;
}