// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 */

6
#include <linux/pagemap.h>
#include <linux/sched.h>
8
#include <linux/sched/signal.h>
9
#include <linux/slab.h>
10
#include <linux/math64.h>
11
#include <linux/ratelimit.h>
12
#include <linux/error-injection.h>
13
#include <linux/sched/mm.h>
14
#include "misc.h"
#include "ctree.h"
16 17
#include "free-space-cache.h"
#include "transaction.h"
18
#include "disk-io.h"
19
#include "extent_io.h"
20
#include "volumes.h"
21
#include "space-info.h"
22
#include "delalloc-space.h"
23
#include "block-group.h"
24
#include "discard.h"
25

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M
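/*
 * Rough sizing note (a sketch assuming 4KiB pages and a 4KiB ->unit, the
 * common sectorsize): a bitmap occupies one page and therefore covers
 * BITS_PER_BITMAP = 4096 * 8 = 32768 units, i.e. 128MiB of the block group,
 * and offset_to_bitmap() below rounds offsets down to such a 128MiB-aligned
 * chunk relative to ctl->start.
 */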

struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

36
static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
38 39
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes);

49 50 51
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
52
{
53
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
60
	unsigned nofs_flag;
61 62 63
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
64
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
71
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
80
	btrfs_release_path(path);
81

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
88
	btrfs_release_path(path);
89
	memalloc_nofs_restore(nofs_flag);
90 91 92
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
94 95
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));
96

97 98 99
	return inode;
}

100
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
101
		struct btrfs_path *path)
102
{
103
	struct btrfs_fs_info *fs_info = block_group->fs_info;
104
	struct inode *inode = NULL;
105
	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

114
	inode = __lookup_free_space_inode(fs_info->tree_root, path,
115
					  block_group->start);
116 117 118
	if (IS_ERR(inode))
		return inode;

119
	spin_lock(&block_group->lock);
120
	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
121
		btrfs_info(fs_info, "Old style space inode found, converting.");
122 123
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
			BTRFS_INODE_NODATACOW;
124 125 126
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

127
	if (!block_group->iref) {
		block_group->inode = igrab(inode);
		block_group->iref = 1;
	}
	spin_unlock(&block_group->lock);

	return inode;
}

136 137 138 139
static int __create_free_space_inode(struct btrfs_root *root,
				     struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     u64 ino, u64 offset)
140 141 142 143 144 145
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
146 147 148
	/* We inline CRCs for the free disk space cache */
	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
149 150
	int ret;

151
	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
159
	memzero_extent_buffer(leaf, (unsigned long)inode_item,
			     sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
167
	btrfs_set_inode_flags(leaf, inode_item, flags);
168 169
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
170
	btrfs_set_inode_block_group(leaf, inode_item, offset);
171
	btrfs_mark_buffer_dirty(leaf);
172
	btrfs_release_path(path);
173 174

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
175
	key.offset = offset;
176 177 178 179
	key.type = 0;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
180
		btrfs_release_path(path);
181 182
		return ret;
	}
183

184 185 186
	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
187
	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
188 189
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
190
	btrfs_release_path(path);
191 192 193 194

	return 0;
}

195
int create_free_space_inode(struct btrfs_trans_handle *trans,
196
			    struct btrfs_block_group *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

202
	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
203 204 205
	if (ret < 0)
		return ret;

206
	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
207
					 ino, block_group->start);
208 209
}

/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
				  struct inode *inode,
				  struct btrfs_block_group *block_group)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!inode)
		inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) != -ENOENT)
			ret = PTR_ERR(inode);
		goto out;
	}
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		btrfs_add_delayed_iput(inode);
		goto out;
	}
	clear_nlink(inode);
	/* One for the block groups ref */
	spin_lock(&block_group->lock);
	if (block_group->iref) {
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
	} else {
		spin_unlock(&block_group->lock);
	}
	/* One for the lookup ref */
	btrfs_add_delayed_iput(inode);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.type = 0;
	key.offset = block_group->start;
	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
				-1, 1);
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto out;
	}
	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

269
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
270
				       struct btrfs_block_rsv *rsv)
271
{
272
	u64 needed_bytes;
273
	int ret;
274 275

	/* 1 for slack space, 1 for updating the inode */
276 277
	needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
		btrfs_calc_metadata_size(fs_info, 1);
278

	spin_lock(&rsv->lock);
	if (rsv->reserved < needed_bytes)
		ret = -ENOSPC;
	else
		ret = 0;
	spin_unlock(&rsv->lock);
285
	return ret;
286 287
}

288
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
289
				    struct btrfs_block_group *block_group,
290 291
				    struct inode *inode)
{
292
	struct btrfs_root *root = BTRFS_I(inode)->root;
293
	int ret = 0;
294
	bool locked = false;
295 296

	if (block_group) {
297 298 299 300 301 302
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
303
		locked = true;
304 305 306 307
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

308
			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * now that we've truncated the cache away, its no longer
		 * setup or written
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
319
		btrfs_free_path(path);
320
	}
321

322
	btrfs_i_size_write(BTRFS_I(inode), 0);
323
	truncate_pagecache(inode, 0);
324 325

	/*
326 327
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
328
	 */
329
	ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
330
					 0, BTRFS_EXTENT_DATA_KEY);
331 332
	if (ret)
		goto fail;
333

334
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
335 336

fail:
337 338
	if (locked)
		mutex_unlock(&trans->transaction->cache_write_mutex);
339
	if (ret)
340
		btrfs_abort_transaction(trans, ret);
341

342
	return ret;
343 344
}

345
static void readahead_cache(struct inode *inode)
{
	struct file_ra_state *ra;
	unsigned long last_index;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
352
		return;
353 354

	file_ra_state_init(ra, inode->i_mapping);
355
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);

	kfree(ra);
}

362
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
363
		       int write)
364
{
365 366
	int num_pages;

367
	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
368

369
	/* Make sure we can fit our crcs and generation into the first page */
370
	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
371 372
		return -ENOSPC;

373
	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
374

375
	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
376 377
	if (!io_ctl->pages)
		return -ENOMEM;
378 379

	io_ctl->num_pages = num_pages;
380
	io_ctl->fs_info = btrfs_sb(inode->i_sb);
381
	io_ctl->inode = inode;
382

383 384
	return 0;
}
385
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
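/*
 * Capacity sketch for the write-side check in io_ctl_init() above (assuming
 * 4KiB pages): the first page must hold one u32 crc per page plus a u64
 * generation, so a writeable cache file is limited to
 * (4096 - 8) / 4 = 1022 pages, i.e. roughly 4MiB of cache data.
 */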

387
static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
388 389
{
	kfree(io_ctl->pages);
390
	io_ctl->pages = NULL;
391 392
}

393
static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}

401
static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
402
{
403
	ASSERT(io_ctl->index < io_ctl->num_pages);
404
	io_ctl->page = io_ctl->pages[io_ctl->index++];
405
	io_ctl->cur = page_address(io_ctl->page);
406
	io_ctl->orig = io_ctl->cur;
407
	io_ctl->size = PAGE_SIZE;
408
	if (clear)
409
		clear_page(io_ctl->cur);
410 411
}

412
static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
419 420 421
		if (io_ctl->pages[i]) {
			ClearPageChecked(io_ctl->pages[i]);
			unlock_page(io_ctl->pages[i]);
422
			put_page(io_ctl->pages[i]);
423
		}
424 425 426
	}
}

427
static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
428 429
{
	struct page *page;
430
	struct inode *inode = io_ctl->inode;
431 432 433 434
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
435 436
		int ret;

		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}

		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			io_ctl_drop_pages(io_ctl);
			return ret;
		}

451 452 453 454
		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (page->mapping != inode->i_mapping) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "free space cache page truncated");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
461
			if (!PageUptodate(page)) {
462 463
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					   "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
		}
	}

470
	for (i = 0; i < io_ctl->num_pages; i++)
471 472
		clear_page_dirty_for_io(io_ctl->pages[i]);

	return 0;
}
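/*
 * On-disk layout handled by the helpers below, as implied by their offset
 * arithmetic: the first page begins with an array of one u32 crc per page
 * followed by a u64 generation, and the rest of every page holds free space
 * entries or whole-page bitmaps.  io_ctl_set_crc()/io_ctl_check_crc() store
 * and verify each page's crc at its index in that first-page array.
 */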

static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
477 478 479 480
{
	io_ctl_map_page(io_ctl, 1);

	/*
481 482
	 * Skip the csum areas.  If we don't check crcs then we just have a
	 * 64bit chunk at the front of the first page.
483
	 */
484 485
	io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
486

487
	put_unaligned_le64(generation, io_ctl->cur);
488 489 490
	io_ctl->cur += sizeof(u64);
}

491
static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
492
{
493
	u64 cache_gen;
494

495 496 497 498
	/*
	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
	 * chunk at the front of the first page.
	 */
499 500
	io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
501

502 503
	cache_gen = get_unaligned_le64(io_ctl->cur);
	if (cache_gen != generation) {
504
		btrfs_err_rl(io_ctl->fs_info,
505
			"space cache generation (%llu) does not match inode (%llu)",
506
				cache_gen, generation);
507 508 509 510
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}
	io_ctl->cur += sizeof(u64);
511 512 513
	return 0;
}

514
static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
521
		offset = sizeof(u32) * io_ctl->num_pages;
522

523 524
	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
525
	io_ctl_unmap_page(io_ctl);
526
	tmp = page_address(io_ctl->pages[0]);
527 528 529 530
	tmp += index;
	*tmp = crc;
}

531
static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp, val;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

540
	tmp = page_address(io_ctl->pages[0]);
541 542 543 544
	tmp += index;
	val = *tmp;

	io_ctl_map_page(io_ctl, 0);
545 546
	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
547
	if (val != crc) {
548
		btrfs_err_rl(io_ctl->fs_info,
549
			"csum mismatch on free space cache");
550 551 552 553
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}

554 555 556
	return 0;
}

557
static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
			    void *bitmap)
{
	struct btrfs_free_space_entry *entry;

	if (!io_ctl->cur)
		return -ENOSPC;

	entry = io_ctl->cur;
566 567
	put_unaligned_le64(offset, &entry->offset);
	put_unaligned_le64(bytes, &entry->bytes);
	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
		BTRFS_FREE_SPACE_EXTENT;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

576
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);

	/* No more pages to map */
	if (io_ctl->index >= io_ctl->num_pages)
		return 0;

	/* map the next page */
	io_ctl_map_page(io_ctl, 1);
	return 0;
}

587
static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
	if (!io_ctl->cur)
		return -ENOSPC;

	/*
	 * If we aren't at the start of the current page, unmap this one and
	 * map the next one if there is any left.
	 */
	if (io_ctl->cur != io_ctl->orig) {
597
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
598 599 600 601 602
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

603
	copy_page(io_ctl->cur, bitmap);
604
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
605 606 607 608 609
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}

610
static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
611
{
	/*
	 * If we're not on the boundary we know we've modified the page and we
	 * need to crc the page.
	 */
	if (io_ctl->cur != io_ctl->orig)
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	else
		io_ctl_unmap_page(io_ctl);
620 621 622

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
623
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
624 625 626
	}
}

627
static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
628
			    struct btrfs_free_space *entry, u8 *type)
629 630
{
	struct btrfs_free_space_entry *e;
	int ret;

	if (!io_ctl->cur) {
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}
638 639

	e = io_ctl->cur;
640 641
	entry->offset = get_unaligned_le64(&e->offset);
	entry->bytes = get_unaligned_le64(&e->bytes);
642
	*type = e->type;
643 644 645 646
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
647
		return 0;
648 649 650

	io_ctl_unmap_page(io_ctl);

651
	return 0;
652 653
}

654
static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
655
			      struct btrfs_free_space *entry)
656
{
	int ret;

	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
	if (ret)
		return ret;

663
	copy_page(entry->bitmap, io_ctl->cur);
664
	io_ctl_unmap_page(io_ctl);
665 666

	return 0;
667 668
}

static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group *block_group = ctl->private;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->length;
	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	max_bitmaps = max_t(u64, max_bitmaps, 1);

	ASSERT(ctl->total_bitmaps <= max_bitmaps);

	/*
	 * We are trying to keep the total amount of memory used per 1GiB of
	 * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
	 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
	 * bitmaps, we may end up using more memory than this.
	 */
	if (size < SZ_1G)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);

	bitmap_bytes = ctl->total_bitmaps * ctl->unit;

	/*
	 * We want the extent entry threshold to be whatever is left after
	 * the bitmaps, capped at half of the maximum bytes we may use.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);

	ctl->extents_thresh =
		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}
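/*
 * Worked example for the thresholds above (a sketch for a block group
 * smaller than 1GiB with no bitmaps yet): max_bytes is
 * MAX_CACHE_BYTES_PER_GIG (64KiB), bitmap_bytes is 0, extent_bytes gets
 * capped at max_bytes >> 1 = 32KiB, and extents_thresh becomes
 * 32KiB / sizeof(struct btrfs_free_space).
 */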

static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_path *path, u64 offset)
710
{
711
	struct btrfs_fs_info *fs_info = root->fs_info;
712 713
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
714
	struct btrfs_io_ctl io_ctl;
715
	struct btrfs_key key;
716
	struct btrfs_free_space *e, *n;
717
	LIST_HEAD(bitmaps);
718 719 720
	u64 num_entries;
	u64 num_bitmaps;
	u64 generation;
721
	u8 type;
722
	int ret = 0;
723 724

	/* Nothing in the space cache, goodbye */
725
	if (!i_size_read(inode))
726
		return 0;
727 728

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
729
	key.offset = offset;
730 731 732
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
733
	if (ret < 0)
734
		return 0;
735
	else if (ret > 0) {
736
		btrfs_release_path(path);
737
		return 0;
738 739
	}

740 741
	ret = -1;

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
748
	btrfs_release_path(path);
749

750
	if (!BTRFS_I(inode)->generation) {
751
		btrfs_info(fs_info,
752
			   "the free space cache file (%llu) is invalid, skip it",
753 754 755 756
			   offset);
		return 0;
	}

757
	if (BTRFS_I(inode)->generation != generation) {
758 759 760
		btrfs_err(fs_info,
			  "free space inode generation (%llu) did not match free space cache generation (%llu)",
			  BTRFS_I(inode)->generation, generation);
761
		return 0;
762 763 764
	}

	if (!num_entries)
765
		return 0;
766

767
	ret = io_ctl_init(&io_ctl, inode, 0);
768 769 770
	if (ret)
		return ret;

771
	readahead_cache(inode);
772

773
	ret = io_ctl_prepare_pages(&io_ctl, true);
774 775
	if (ret)
		goto out;
776

777 778 779 780
	ret = io_ctl_check_crc(&io_ctl, 0);
	if (ret)
		goto free_cache;

781 782 783
	ret = io_ctl_check_generation(&io_ctl, generation);
	if (ret)
		goto free_cache;
784

785 786 787
	while (num_entries) {
		e = kmem_cache_zalloc(btrfs_free_space_cachep,
				      GFP_NOFS);
788 789
		if (!e) {
			ret = -ENOMEM;
790
			goto free_cache;
791
		}
792

		ret = io_ctl_read_entry(&io_ctl, e, &type);
		if (ret) {
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

799
		if (!e->bytes) {
800
			ret = -1;
801 802
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
803
		}

		if (type == BTRFS_FREE_SPACE_EXTENT) {
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
810
				btrfs_err(fs_info,
811
					"Duplicate entries in free space cache, dumping");
812
				kmem_cache_free(btrfs_free_space_cachep, e);
813 814
				goto free_cache;
			}
815
		} else {
816
			ASSERT(num_bitmaps);
817
			num_bitmaps--;
818 819
			e->bitmap = kmem_cache_zalloc(
					btrfs_free_space_bitmap_cachep, GFP_NOFS);
820
			if (!e->bitmap) {
821
				ret = -ENOMEM;
822 823
				kmem_cache_free(
					btrfs_free_space_cachep, e);
824 825
				goto free_cache;
			}
826 827 828
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			ctl->total_bitmaps++;
829
			recalculate_thresholds(ctl);
830 831
			spin_unlock(&ctl->tree_lock);
			if (ret) {
832
				btrfs_err(fs_info,
833
					"Duplicate entries in free space cache, dumping");
834
				kmem_cache_free(btrfs_free_space_cachep, e);
835 836
				goto free_cache;
			}
837
			list_add_tail(&e->list, &bitmaps);
838 839
		}

840 841
		num_entries--;
	}
842

843 844
	io_ctl_unmap_page(&io_ctl);

	/*
	 * The bitmap contents are stored after all of the regular entries in
	 * the cache file, so only read them in once every entry has been added.
	 */
	list_for_each_entry_safe(e, n, &bitmaps, list) {
850
		list_del_init(&e->list);
851 852 853
		ret = io_ctl_read_bitmap(&io_ctl, e);
		if (ret)
			goto free_cache;
854 855
	}

856
	io_ctl_drop_pages(&io_ctl);
857 858
	ret = 1;
out:
859
	io_ctl_free(&io_ctl);
860 861
	return ret;
free_cache:
862
	io_ctl_drop_pages(&io_ctl);
863
	__btrfs_remove_free_space_cache(ctl);
864 865 866
	goto out;
}

static int copy_free_space_cache(struct btrfs_block_group *block_group,
				 struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int ret = 0;

	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info);
			ret = btrfs_add_free_space(block_group, info->offset,
						   info->bytes);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			u64 offset = info->offset;
			u64 bytes = ctl->unit;

			while (search_bitmap(ctl, info, &offset, &bytes,
					     false) == 0) {
				ret = btrfs_add_free_space(block_group, offset,
							   bytes);
				if (ret)
					break;
				bitmap_clear_bits(ctl, info, offset, bytes);
				offset = info->offset;
				bytes = ctl->unit;
			}
			free_bitmap(ctl, info);
		}
		cond_resched();
	}
	return ret;
}
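/*
 * Note on the bitmap branch above: search_bitmap() is restarted from the
 * bitmap's own offset after every hit and each found run is cleared with
 * bitmap_clear_bits(), so the loop drains the bitmap run by run into
 * btrfs_add_free_space() until nothing is left, then frees the bitmap.
 */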

902
int load_free_space_cache(struct btrfs_block_group *block_group)
903
{
904
	struct btrfs_fs_info *fs_info = block_group->fs_info;
905
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
906
	struct btrfs_free_space_ctl tmp_ctl = {};
907 908
	struct inode *inode;
	struct btrfs_path *path;
909
	int ret = 0;
910
	bool matched;
911
	u64 used = block_group->used;
912

	/*
	 * Because we could potentially discard our loaded free space, we want
	 * to load everything into a temporary structure first, and then if it's
	 * valid copy it all into the actual free space ctl.
	 */
	btrfs_init_free_space_ctl(block_group, &tmp_ctl);

920 921 922 923
	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
924
	spin_lock(&block_group->lock);
925 926 927 928
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		return 0;
	}
929
	spin_unlock(&block_group->lock);
930 931 932 933

	path = btrfs_alloc_path();
	if (!path)
		return 0;
934 935
	path->search_commit_root = 1;
	path->skip_locking = 1;
936

	/*
	 * We must pass a path with search_commit_root set to btrfs_iget in
	 * order to avoid a deadlock when allocating extents for the tree root.
	 *
	 * When we are COWing an extent buffer from the tree root, when looking
	 * for a free extent, at extent-tree.c:find_free_extent(), we can find
	 * block group without its free space cache loaded. When we find one
	 * we must load its space cache which requires reading its free space
	 * cache's inode item from the root tree. If this inode item is located
	 * in the same leaf that we started COWing before, then we end up in
	 * deadlock on the extent buffer (trying to read lock it when we
	 * previously write locked it).
	 *
	 * It's safe to read the inode item using the commit root because
	 * block groups, once loaded, stay in memory forever (until they are
	 * removed) as well as their space caches once loaded. New block groups
	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
	 * we will never try to read their inode item while the fs is mounted.
	 */
956
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

962 963 964 965
	/* We may have converted the inode and made the cache invalid. */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
966
		btrfs_free_path(path);
967 968 969 970
		goto out;
	}
	spin_unlock(&block_group->lock);

971
	ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
972
				      path, block_group->start);
973 974 975 976
	btrfs_free_path(path);
	if (ret <= 0)
		goto out;

977 978
	matched = (tmp_ctl.free_space == (block_group->length - used -
					  block_group->bytes_super));
979

	if (matched) {
		ret = copy_free_space_cache(block_group, &tmp_ctl);
		/*
		 * ret == 1 means we successfully loaded the free space cache,
		 * so we need to re-set it here.
		 */
		if (ret == 0)
			ret = 1;
	} else {
		__btrfs_remove_free_space_cache(&tmp_ctl);
990 991
		btrfs_warn(fs_info,
			   "block group %llu has wrong amount of free space",
992
			   block_group->start);
		ret = -1;
	}
out:
	if (ret < 0) {
		/* This cache is bogus, make sure it gets cleared */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
1001
		ret = 0;
1002

1003 1004
		btrfs_warn(fs_info,
			   "failed to load free space cache for block group %llu, rebuilding it now",
1005
			   block_group->start);
1006 1007
	}

1008 1009 1010
	spin_lock(&ctl->tree_lock);
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
1011 1012
	iput(inode);
	return ret;
1013 1014
}

1015
static noinline_for_stack
1016
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
1017
			      struct btrfs_free_space_ctl *ctl,
1018
			      struct btrfs_block_group *block_group,
1019 1020
			      int *entries, int *bitmaps,
			      struct list_head *bitmap_list)
1021
{
1022
	int ret;
1023
	struct btrfs_free_cluster *cluster = NULL;
1024
	struct btrfs_free_cluster *cluster_locked = NULL;
1025
	struct rb_node *node = rb_first(&ctl->free_space_offset);
1026
	struct btrfs_trim_range *trim_entry;
1027

1028
	/* Get the cluster for this block_group if it exists */
1029
	if (block_group && !list_empty(&block_group->cluster_list)) {
1030 1031 1032
		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
1033
	}
1034

1035
	if (!node && cluster) {
1036 1037
		cluster_locked = cluster;
		spin_lock(&cluster_locked->lock);
1038 1039 1040 1041
		node = rb_first(&cluster->root);
		cluster = NULL;
	}

1042 1043 1044
	/* Write out the extent entries */
	while (node) {
		struct btrfs_free_space *e;
1045

1046
		e = rb_entry(node, struct btrfs_free_space, offset_index);
1047
		*entries += 1;
1048

1049
		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
1050 1051
				       e->bitmap);
		if (ret)
1052
			goto fail;
1053

1054
		if (e->bitmap) {
1055 1056
			list_add_tail(&e->list, bitmap_list);
			*bitmaps += 1;
1057
		}
1058 1059 1060
		node = rb_next(node);
		if (!node && cluster) {
			node = rb_first(&cluster->root);
1061 1062
			cluster_locked = cluster;
			spin_lock(&cluster_locked->lock);
1063
			cluster = NULL;
1064
		}
1065
	}
1066 1067 1068 1069
	if (cluster_locked) {
		spin_unlock(&cluster_locked->lock);
		cluster_locked = NULL;
	}

	/*
	 * Make sure we don't miss any range that was removed from our rbtree
	 * because trimming is running. Otherwise after a umount+mount (or crash
	 * after committing the transaction) we would leak free space and get
	 * an inconsistent free space cache report from fsck.
	 */
	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
				       trim_entry->bytes, NULL);
		if (ret)
			goto fail;
		*entries += 1;
	}

1085 1086
	return 0;
fail:
1087 1088
	if (cluster_locked)
		spin_unlock(&cluster_locked->lock);
	return -ENOSPC;
}

static noinline_for_stack int
update_cache_item(struct btrfs_trans_handle *trans,
		  struct btrfs_root *root,
		  struct inode *inode,
		  struct btrfs_path *path, u64 offset,
		  int entries, int bitmaps)
{
	struct btrfs_key key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1111
				 EXTENT_DELALLOC, 0, 0, NULL);
		goto fail;
	}
	leaf = path->nodes[0];
	if (ret > 0) {
		struct btrfs_key found_key;
		ASSERT(path->slots[0]);
		path->slots[0]--;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
		    found_key.offset != offset) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1123 1124
					 inode->i_size - 1, EXTENT_DELALLOC, 0,
					 0, NULL);
			btrfs_release_path(path);
			goto fail;
		}
	}

	BTRFS_I(inode)->generation = trans->transid;
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_set_free_space_entries(leaf, header, entries);
	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;

fail:
	return -1;
}

1145
static noinline_for_stack int write_pinned_extent_entries(
1146
			    struct btrfs_trans_handle *trans,
1147
			    struct btrfs_block_group *block_group,
1148
			    struct btrfs_io_ctl *io_ctl,
1149
			    int *entries)
1150 1151 1152 1153
{
	u64 start, extent_start, extent_end, len;
	struct extent_io_tree *unpin = NULL;
	int ret;
1154

1155 1156 1157
	if (!block_group)
		return 0;

1158 1159 1160
	/*
	 * We want to add any pinned extents to our free space cache
	 * so we don't leak the space
1161
	 *
1162 1163 1164
	 * We shouldn't have switched the pinned extents yet so this is the
	 * right one
	 */
1165
	unpin = &trans->transaction->pinned_extents;
1166

1167
	start = block_group->start;
1168

1169
	while (start < block_group->start + block_group->length) {
1170 1171
		ret = find_first_extent_bit(unpin, start,
					    &extent_start, &extent_end,
1172
					    EXTENT_DIRTY, NULL);
1173 1174
		if (ret)
			return 0;
1175

1176
		/* This pinned extent is out of our range */
1177
		if (extent_start >= block_group->start + block_group->length)
1178
			return 0;
1179

1180
		extent_start = max(extent_start, start);
1181 1182
		extent_end = min(block_group->start + block_group->length,
				 extent_end + 1);
1183
		len = extent_end - extent_start;
1184

1185 1186
		*entries += 1;
		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1187
		if (ret)
1188
			return -ENOSPC;
1189

1190
		start = extent_end;
1191
	}
1192

1193 1194 1195 1196
	return 0;
}

static noinline_for_stack int
1197
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1198
{
1199
	struct btrfs_free_space *entry, *next;
1200 1201
	int ret;

1202
	/* Write out the bitmaps */
1203
	list_for_each_entry_safe(entry, next, bitmap_list, list) {
1204
		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1205
		if (ret)
1206
			return -ENOSPC;
1207
		list_del_init(&entry->list);
1208 1209
	}

1210 1211
	return 0;
}
1212

1213 1214 1215
static int flush_dirty_cache(struct inode *inode)
{
	int ret;
1216

1217
	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1218
	if (ret)
1219
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1220
				 EXTENT_DELALLOC, 0, 0, NULL);
1221

1222
	return ret;
1223 1224 1225
}

static void noinline_for_stack
1226
cleanup_bitmap_list(struct list_head *bitmap_list)
1227
{
1228
	struct btrfs_free_space *entry, *next;
1229

1230
	list_for_each_entry_safe(entry, next, bitmap_list, list)
1231
		list_del_init(&entry->list);
}

static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
			   struct btrfs_io_ctl *io_ctl,
1237
			   struct extent_state **cached_state)
1238
{
1239 1240
	io_ctl_drop_pages(io_ctl);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1241
			     i_size_read(inode) - 1, cached_state);
1242
}
1243

1244 1245
static int __btrfs_wait_cache_io(struct btrfs_root *root,
				 struct btrfs_trans_handle *trans,
1246
				 struct btrfs_block_group *block_group,
1247 1248
				 struct btrfs_io_ctl *io_ctl,
				 struct btrfs_path *path, u64 offset)
1249 1250 1251 1252
{
	int ret;
	struct inode *inode = io_ctl->inode;

1253 1254 1255
	if (!inode)
		return 0;

	/* Flush the dirty pages in the cache file. */
	ret = flush_dirty_cache(inode);
	if (ret)
		goto out;

	/* Update the cache item to tell everyone this cache file is valid. */
	ret = update_cache_item(trans, root, inode, path, offset,
				io_ctl->entries, io_ctl->bitmaps);
out:
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
1268 1269
		if (block_group)
			btrfs_debug(root->fs_info,
1270 1271
	  "failed to write free space cache for block group %llu error %d",
				  block_group->start, ret);
1272
	}
1273
	btrfs_update_inode(trans, root, BTRFS_I(inode));
1274 1275

	if (block_group) {
1276 1277 1278 1279
		/* the dirty list is protected by the dirty_bgs_lock */
		spin_lock(&trans->transaction->dirty_bgs_lock);

		/* the disk_cache_state is protected by the block group lock */
1280 1281 1282 1283
		spin_lock(&block_group->lock);

		/*
		 * only mark this as written if we didn't get put back on
1284 1285
		 * the dirty list while waiting for IO.   Otherwise our
		 * cache state won't be right, and we won't get written again
1286 1287 1288 1289 1290 1291 1292
		 */
		if (!ret && list_empty(&block_group->dirty_list))
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		else if (ret)
			block_group->disk_cache_state = BTRFS_DC_ERROR;

		spin_unlock(&block_group->lock);
1293
		spin_unlock(&trans->transaction->dirty_bgs_lock);
1294 1295 1296 1297 1298 1299 1300 1301
		io_ctl->inode = NULL;
		iput(inode);
	}

	return ret;

}

1302
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1303
			struct btrfs_block_group *block_group,
1304 1305 1306 1307
			struct btrfs_path *path)
{
	return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
				     block_group, &block_group->io_ctl,
1308
				     path, block_group->start);
1309 1310
}

1311
/**
1312 1313 1314 1315 1316 1317 1318 1319
 * Write out cached info to an inode
 *
 * @root:        root the inode belongs to
 * @inode:       freespace inode we are writing out
 * @ctl:         free space cache we are going to write out
 * @block_group: block_group for this cache if it belongs to a block_group
 * @io_ctl:      holds context for the io
 * @trans:       the trans handle
1320 1321
 *
 * This function writes out a free space cache struct to disk for quick recovery
G
 * on mount.  This will return 0 if it was successful in writing the cache out,
1323
 * or an errno if it was not.
1324 1325 1326
 */
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
1327
				   struct btrfs_block_group *block_group,
1328
				   struct btrfs_io_ctl *io_ctl,
1329
				   struct btrfs_trans_handle *trans)
1330 1331
{
	struct extent_state *cached_state = NULL;
1332
	LIST_HEAD(bitmap_list);
1333 1334 1335
	int entries = 0;
	int bitmaps = 0;
	int ret;
1336
	int must_iput = 0;
1337 1338

	if (!i_size_read(inode))
1339
		return -EIO;
1340

1341
	WARN_ON(io_ctl->pages);
1342
	ret = io_ctl_init(io_ctl, inode, 1);
1343
	if (ret)
1344
		return ret;
1345

1346 1347 1348 1349 1350 1351 1352 1353 1354
	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
		down_write(&block_group->data_rwsem);
		spin_lock(&block_group->lock);
		if (block_group->delalloc_bytes) {
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
			spin_unlock(&block_group->lock);
			up_write(&block_group->data_rwsem);
			BTRFS_I(inode)->generation = 0;
			ret = 0;
1355
			must_iput = 1;
1356 1357 1358 1359 1360
			goto out;
		}
		spin_unlock(&block_group->lock);
	}

1361
	/* Lock all pages first so we can lock the extent safely. */
1362
	ret = io_ctl_prepare_pages(io_ctl, false);
1363
	if (ret)
1364
		goto out_unlock;
1365 1366

	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1367
			 &cached_state);
1368

1369
	io_ctl_set_generation(io_ctl, trans->transid);
1370

1371
	mutex_lock(&ctl->cache_writeout_mutex);
1372
	/* Write out the extent entries in the free space cache */
1373
	spin_lock(&ctl->tree_lock);
1374
	ret = write_cache_extent_entries(io_ctl, ctl,
1375 1376
					 block_group, &entries, &bitmaps,
					 &bitmap_list);
1377 1378
	if (ret)
		goto out_nospc_locked;
1379

1380 1381 1382 1383
	/*
	 * Some spaces that are freed in the current transaction are pinned,
	 * they will be added into free space cache after the transaction is
	 * committed, we shouldn't lose them.
1384 1385 1386
	 *
	 * If this changes while we are working we'll get added back to
	 * the dirty list and redo it.  No locking needed
1387
	 */
1388
	ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
1389 1390
	if (ret)
		goto out_nospc_locked;
1391

1392 1393 1394 1395 1396
	/*
	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
	 * locked while doing it because a concurrent trim can be manipulating
	 * or freeing the bitmap.
	 */
1397
	ret = write_bitmap_entries(io_ctl, &bitmap_list);
1398
	spin_unlock(&ctl->tree_lock);
1399
	mutex_unlock(&ctl->cache_writeout_mutex);
1400 1401 1402 1403
	if (ret)
		goto out_nospc;

	/* Zero out the rest of the pages just to make sure */
1404
	io_ctl_zero_remaining_pages(io_ctl);
1405

1406
	/* Everything is written out, now we dirty the pages in the file. */
1407 1408
	ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
				io_ctl->num_pages, 0, i_size_read(inode),
1409
				&cached_state, false);
1410
	if (ret)
1411
		goto out_nospc;
1412

1413 1414
	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);
1415 1416 1417 1418
	/*
	 * Release the pages and unlock the extent, we will flush
	 * them out later
	 */
1419
	io_ctl_drop_pages(io_ctl);
1420
	io_ctl_free(io_ctl);
1421 1422

	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1423
			     i_size_read(inode) - 1, &cached_state);
1424

1425 1426
	/*
	 * at this point the pages are under IO and we're happy,
1427
	 * The caller is responsible for waiting on them and updating
1428 1429 1430 1431 1432 1433
	 * the cache and the inode
	 */
	io_ctl->entries = entries;
	io_ctl->bitmaps = bitmaps;

	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1434
	if (ret)
1435 1436
		goto out;

1437 1438
	return 0;

1439 1440 1441 1442 1443
out_nospc_locked:
	cleanup_bitmap_list(&bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

1444
out_nospc:
1445
	cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1446

1447
out_unlock:
1448 1449 1450
	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);

1451 1452 1453 1454 1455 1456 1457
out:
	io_ctl->inode = NULL;
	io_ctl_free(io_ctl);
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
	}
1458
	btrfs_update_inode(trans, root, BTRFS_I(inode));
1459 1460 1461
	if (must_iput)
		iput(inode);
	return ret;
1462 1463
}

1464
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1465
			  struct btrfs_block_group *block_group,
1466 1467
			  struct btrfs_path *path)
{
1468
	struct btrfs_fs_info *fs_info = trans->fs_info;
1469 1470 1471 1472 1473 1474 1475
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct inode *inode;
	int ret = 0;

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
1476 1477
		return 0;
	}
1478 1479
	spin_unlock(&block_group->lock);

1480
	inode = lookup_free_space_inode(block_group, path);
1481 1482 1483
	if (IS_ERR(inode))
		return 0;

1484 1485
	ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
				block_group, &block_group->io_ctl, trans);
1486
	if (ret) {
1487
		btrfs_debug(fs_info,
1488 1489
	  "failed to write free space cache for block group %llu error %d",
			  block_group->start, ret);
1490 1491 1492 1493 1494 1495
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);

		block_group->io_ctl.inode = NULL;
		iput(inode);
1496 1497
	}

1498 1499 1500 1501 1502
	/*
	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
	 * to wait for IO and put the inode
	 */

J
	return ret;
}

1506
static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1507
					  u64 offset)
J
Josef Bacik 已提交
1508
{
1509
	ASSERT(offset >= bitmap_start);
1510
	offset -= bitmap_start;
1511
	return (unsigned long)(div_u64(offset, unit));
1512
}
1513

1514
static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1515
{
1516
	return (unsigned long)(div_u64(bytes, unit));
1517
}
J
Josef Bacik 已提交
1518

1519
static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1520 1521 1522
				   u64 offset)
{
	u64 bitmap_start;
1523
	u64 bytes_per_bitmap;
1524

1525 1526
	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
1527
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1528
	bitmap_start *= bytes_per_bitmap;
1529
	bitmap_start += ctl->start;
J
Josef Bacik 已提交
1530

1531
	return bitmap_start;
J
Josef Bacik 已提交
1532 1533
}

1534 1535
static int tree_insert_offset(struct rb_root *root, u64 offset,
			      struct rb_node *node, int bitmap)
1536 1537 1538 1539 1540 1541 1542
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
1543
		info = rb_entry(parent, struct btrfs_free_space, offset_index);
J

1545
		if (offset < info->offset) {
J
Josef Bacik 已提交
1546
			p = &(*p)->rb_left;
1547
		} else if (offset > info->offset) {
J
			p = &(*p)->rb_right;
1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset.  If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap.  So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically.  If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
			if (bitmap) {
1564 1565 1566 1567
				if (info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
1568 1569
				p = &(*p)->rb_right;
			} else {
1570 1571 1572 1573
				if (!info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
1574 1575 1576
				p = &(*p)->rb_left;
			}
		}
1577 1578 1579 1580 1581 1582 1583 1584 1585
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);

	return 0;
}

/*
1586 1587
 * searches the tree for the given offset.
 *
1588 1589 1590
 * fuzzy - If this is set, then we are trying to make an allocation, and we just
 * want a section that has at least bytes size and comes at or after the given
 * offset.
J
Josef Bacik 已提交
1591
 */
1592
static struct btrfs_free_space *
1593
tree_search_offset(struct btrfs_free_space_ctl *ctl,
1594
		   u64 offset, int bitmap_only, int fuzzy)
1595
{
1596
	struct rb_node *n = ctl->free_space_offset.rb_node;
1597 1598 1599 1600 1601 1602 1603 1604
	struct btrfs_free_space *entry, *prev = NULL;

	/* find entry that is closest to the 'offset' */
	while (1) {
		if (!n) {
			entry = NULL;
			break;
		}
J
Josef Bacik 已提交
1605 1606

		entry = rb_entry(n, struct btrfs_free_space, offset_index);
1607
		prev = entry;
J
Josef Bacik 已提交
1608

1609
		if (offset < entry->offset)
J
Josef Bacik 已提交
1610
			n = n->rb_left;
1611
		else if (offset > entry->offset)
J
Josef Bacik 已提交
1612
			n = n->rb_right;
1613
		else
1614 1615 1616
			break;
	}

1617 1618 1619 1620 1621
	if (bitmap_only) {
		if (!entry)
			return NULL;
		if (entry->bitmap)
			return entry;
J
Josef Bacik 已提交
1622

1623 1624 1625 1626 1627 1628 1629 1630 1631 1632
		/*
		 * bitmap entry and extent entry may share same offset,
		 * in that case, bitmap entry comes after extent entry.
		 */
		n = rb_next(n);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->offset != offset)
			return NULL;
J
Josef Bacik 已提交
1633

1634 1635 1636 1637
		WARN_ON(!entry->bitmap);
		return entry;
	} else if (entry) {
		if (entry->bitmap) {
1638
			/*
1639 1640
			 * if previous extent entry covers the offset,
			 * we should return it instead of the bitmap entry
J
Josef Bacik 已提交
1641
			 */
1642 1643
			n = rb_prev(&entry->offset_index);
			if (n) {
1644 1645
				prev = rb_entry(n, struct btrfs_free_space,
						offset_index);
1646 1647 1648
				if (!prev->bitmap &&
				    prev->offset + prev->bytes > offset)
					entry = prev;
1649
			}
1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663
		}
		return entry;
	}

	if (!prev)
		return NULL;

	/* find last entry before the 'offset' */
	entry = prev;
	if (entry->offset > offset) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			entry = rb_entry(n, struct btrfs_free_space,
					offset_index);
1664
			ASSERT(entry->offset <= offset);
J
Josef Bacik 已提交
1665
		} else {
1666 1667 1668 1669
			if (fuzzy)
				return entry;
			else
				return NULL;
1670 1671 1672
		}
	}

1673
	if (entry->bitmap) {
1674 1675
		n = rb_prev(&entry->offset_index);
		if (n) {
1676 1677
			prev = rb_entry(n, struct btrfs_free_space,
					offset_index);
1678 1679 1680
			if (!prev->bitmap &&
			    prev->offset + prev->bytes > offset)
				return prev;
1681
		}
1682
		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1683 1684 1685 1686 1687 1688 1689 1690 1691 1692
			return entry;
	} else if (entry->offset + entry->bytes > offset)
		return entry;

	if (!fuzzy)
		return NULL;

	while (1) {
		if (entry->bitmap) {
			if (entry->offset + BITS_PER_BITMAP *
1693
			    ctl->unit > offset)
1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705
				break;
		} else {
			if (entry->offset + entry->bytes > offset)
				break;
		}

		n = rb_next(&entry->offset_index);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
	}
	return entry;
1706 1707
}

1708
static inline void
1709
__unlink_free_space(struct btrfs_free_space_ctl *ctl,
1710
		    struct btrfs_free_space *info)
J
Josef Bacik 已提交
1711
{
1712 1713
	rb_erase(&info->offset_index, &ctl->free_space_offset);
	ctl->free_extents--;
1714

1715
	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1716
		ctl->discardable_extents[BTRFS_STAT_CURR]--;
1717 1718
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
	}
1719 1720
}

1721
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1722 1723
			      struct btrfs_free_space *info)
{
1724 1725
	__unlink_free_space(ctl, info);
	ctl->free_space -= info->bytes;
J
Josef Bacik 已提交
1726 1727
}

1728
static int link_free_space(struct btrfs_free_space_ctl *ctl,
1729 1730 1731 1732
			   struct btrfs_free_space *info)
{
	int ret = 0;

1733
	ASSERT(info->bytes || info->bitmap);
1734
	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1735
				 &info->offset_index, (info->bitmap != NULL));
J
Josef Bacik 已提交
1736 1737 1738
	if (ret)
		return ret;

1739
	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1740
		ctl->discardable_extents[BTRFS_STAT_CURR]++;
1741 1742
		ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
	}
1743

1744 1745
	ctl->free_space += info->bytes;
	ctl->free_extents++;
1746 1747 1748
	return ret;
}
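/*
 * Note on the bitmap_extents accounting in the helpers below (derived from
 * their extent_delta logic): clearing a run of bits removes one free extent
 * unless the run borders set bits; a set bit on one side leaves the count
 * unchanged, and set bits on both sides split an extent so the count grows
 * by one.  Setting bits is the mirror image.
 */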

1749 1750 1751
static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *info,
				       u64 offset, u64 bytes)
1752
{
1753 1754
	unsigned long start, count, end;
	int extent_delta = -1;
1755

1756 1757
	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
1758 1759
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);
1760

1761
	bitmap_clear(info->bitmap, start, count);
1762 1763

	info->bytes -= bytes;
1764 1765
	if (info->max_extent_size > ctl->unit)
		info->max_extent_size = 0;
1766 1767 1768 1769 1770 1771 1772 1773

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta++;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta++;

	info->bitmap_extents += extent_delta;
1774
	if (!btrfs_free_space_trimmed(info)) {
1775
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1776 1777
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
	}
1778 1779 1780 1781 1782 1783 1784
}

static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes)
{
	__bitmap_clear_bits(ctl, info, offset, bytes);
1785
	ctl->free_space -= bytes;
1786 1787
}

1788
static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1789 1790
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
1791
{
1792 1793
	unsigned long start, count, end;
	int extent_delta = 1;
1794

1795 1796
	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
1797 1798
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);
1799

1800
	bitmap_set(info->bitmap, start, count);
1801 1802

	info->bytes += bytes;
1803
	ctl->free_space += bytes;
1804 1805 1806 1807 1808 1809 1810 1811

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta--;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta--;

	info->bitmap_extents += extent_delta;
1812
	if (!btrfs_free_space_trimmed(info)) {
1813
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1814 1815
		ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
	}
1816 1817
}

/*
 * If we cannot find a suitable extent, we will use bytes to record
 * the size of the max extent.
 */
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc)
{
	unsigned long found_bits = 0;
	unsigned long max_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;
	unsigned long extent_bits;

	/*
	 * Skip searching the bitmap if we don't have a contiguous section that
	 * is large enough for this allocation.
	 */
	if (for_alloc &&
	    bitmap_info->max_extent_size &&
	    bitmap_info->max_extent_size < *bytes) {
		*bytes = bitmap_info->max_extent_size;
		return -1;
	}

	i = offset_to_bit(bitmap_info->offset, ctl->unit,
			  max_t(u64, *offset, bitmap_info->offset));
	bits = bytes_to_bits(*bytes, ctl->unit);

	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
		if (for_alloc && bits == 1) {
			found_bits = 1;
			break;
		}
		next_zero = find_next_zero_bit(bitmap_info->bitmap,
					       BITS_PER_BITMAP, i);
		extent_bits = next_zero - i;
		if (extent_bits >= bits) {
			found_bits = extent_bits;
			break;
		} else if (extent_bits > max_bits) {
			max_bits = extent_bits;
		}
		i = next_zero;
	}

	if (found_bits) {
		*offset = (u64)(i * ctl->unit) + bitmap_info->offset;
		*bytes = (u64)(found_bits) * ctl->unit;
		return 0;
	}

	*bytes = (u64)(max_bits) * ctl->unit;
	bitmap_info->max_extent_size = *bytes;
	return -1;
}

static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
{
	if (entry->bitmap)
		return entry->max_extent_size;
	return entry->bytes;
}

/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
		unsigned long align, u64 *max_extent_size)
{
	struct btrfs_free_space *entry;
	struct rb_node *node;
	u64 tmp;
	u64 align_off;
	int ret;

	if (!ctl->free_space_offset.rb_node)
		goto out;

	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
	if (!entry)
		goto out;

	for (node = &entry->offset_index; node; node = rb_next(node)) {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (entry->bytes < *bytes) {
			*max_extent_size = max(get_max_extent_size(entry),
					       *max_extent_size);
			continue;
		}

		/* make sure the space returned is big enough
		 * to match our requested alignment
		 */
		if (*bytes >= align) {
			tmp = entry->offset - ctl->start + align - 1;
			tmp = div64_u64(tmp, align);
			tmp = tmp * align + ctl->start;
			align_off = tmp - entry->offset;
		} else {
			align_off = 0;
			tmp = entry->offset;
		}

		if (entry->bytes < *bytes + align_off) {
			*max_extent_size = max(get_max_extent_size(entry),
					       *max_extent_size);
			continue;
		}

		if (entry->bitmap) {
			u64 size = *bytes;

			ret = search_bitmap(ctl, entry, &tmp, &size, true);
			if (!ret) {
				*offset = tmp;
				*bytes = size;
				return entry;
			} else {
				*max_extent_size =
					max(get_max_extent_size(entry),
					    *max_extent_size);
			}
			continue;
		}

		*offset = tmp;
		*bytes = entry->bytes - align_off;
		return entry;
	}
out:
	return NULL;
}

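/*
 * Turn a pre-allocated free space entry into a bitmap entry covering the
 * bitmap-aligned region that contains @offset and link it into the tree.
 */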
static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info, u64 offset)
{
	info->offset = offset_to_bitmap(ctl, offset);
	info->bytes = 0;
	info->bitmap_extents = 0;
	INIT_LIST_HEAD(&info->list);
	link_free_space(ctl, info);
	ctl->total_bitmaps++;
	recalculate_thresholds(ctl);
}

static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info)
{
	/*
	 * Normally when this is called, the bitmap is completely empty. However,
	 * if we are blowing up the free space cache for one reason or another
	 * via __btrfs_remove_free_space_cache(), then it may not be freed and
	 * we may leave stats on the table.
	 */
	if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			bitmap_info->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
	}
	unlink_free_space(ctl, bitmap_info);
	kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
	ctl->total_bitmaps--;
	recalculate_thresholds(ctl);
}

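/*
 * Remove [offset, offset + bytes) from the bitmap that covers @offset,
 * walking into following bitmaps as needed.  Returns -EAGAIN when the
 * caller should restart the removal (e.g. the rest of the range lives in a
 * regular extent entry), and -EINVAL if the space we expected is not
 * actually set in the bitmap.
 */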
static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *bitmap_info,
			      u64 *offset, u64 *bytes)
{
	u64 end;
	u64 search_start, search_bytes;
	int ret;

again:
	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;

	/*
	 * We need to search for bits in this bitmap.  We could only cover some
	 * of the extent in this bitmap thanks to how we add space, so we need
	 * to search for as much of it as we can and clear that amount, and then
	 * go searching for the next bit.
	 */
	search_start = *offset;
	search_bytes = ctl->unit;
	search_bytes = min(search_bytes, end - search_start + 1);
	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
			    false);
	if (ret < 0 || search_start != *offset)
		return -EINVAL;

	/* We may have found more bits than what we need */
	search_bytes = min(search_bytes, *bytes);

	/* Cannot clear past the end of the bitmap */
	search_bytes = min(search_bytes, end - search_start + 1);

	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
	*offset += search_bytes;
	*bytes -= search_bytes;

	if (*bytes) {
		struct rb_node *next = rb_next(&bitmap_info->offset_index);
		if (!bitmap_info->bytes)
			free_bitmap(ctl, bitmap_info);

		/*
		 * no entry after this bitmap, but we still have bytes to
		 * remove, so something has gone wrong.
		 */
		if (!next)
			return -EINVAL;

		bitmap_info = rb_entry(next, struct btrfs_free_space,
				       offset_index);

		/*
		 * if the next entry isn't a bitmap we need to return to let the
		 * extent stuff do its work.
		 */
		if (!bitmap_info->bitmap)
			return -EAGAIN;

		/*
		 * Ok the next item is a bitmap, but it may not actually hold
		 * the information for the rest of this free space stuff, so
		 * look for it, and if we don't find it return so we can try
		 * everything over again.
		 */
		search_start = *offset;
		search_bytes = ctl->unit;
		ret = search_bitmap(ctl, bitmap_info, &search_start,
				    &search_bytes, false);
		if (ret < 0 || search_start != *offset)
			return -EAGAIN;

		goto again;
	} else if (!bitmap_info->bytes)
		free_bitmap(ctl, bitmap_info);

	return 0;
}

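/*
 * Set as much of [offset, offset + bytes) in @info as fits before the end
 * of the bitmap and return the number of bytes that were actually added.
 * Adding an untrimmed range downgrades the whole bitmap's trim state.
 */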
static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
			       struct btrfs_free_space *info, u64 offset,
			       u64 bytes, enum btrfs_trim_state trim_state)
{
	u64 bytes_to_set = 0;
	u64 end;

	/*
	 * This is a tradeoff to make bitmap trim state minimal.  We mark the
	 * whole bitmap untrimmed if at any point we add untrimmed regions.
	 */
	if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
		if (btrfs_free_space_trimmed(info)) {
			ctl->discardable_extents[BTRFS_STAT_CURR] +=
				info->bitmap_extents;
			ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
		}
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
	}

	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);

	bytes_to_set = min(end - offset, bytes);

	bitmap_set_bits(ctl, info, offset, bytes_to_set);

	/*
	 * We set some bytes, we have no idea what the max extent size is
	 * anymore.
	 */
	info->max_extent_size = 0;

	return bytes_to_set;
}

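/*
 * Decide whether a new free space range should be tracked in a bitmap
 * rather than as a regular extent entry, trading extent entry slots
 * against the memory overhead of a bitmap.
 */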
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
		      struct btrfs_free_space *info)
{
	struct btrfs_block_group *block_group = ctl->private;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	bool forced = false;

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group))
		forced = true;
#endif

	/* This is a way to reclaim large regions from the bitmaps. */
	if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
		return false;

	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent, and don't have to deal with the bitmap
	 */
	if (!forced && ctl->free_extents < ctl->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them to larger extents, however if we have plenty
		 * of cache left then go ahead and add them, no sense in adding
		 * the overhead of a bitmap if we don't have to.
		 */
		if (info->bytes <= fs_info->sectorsize * 8) {
			if (ctl->free_extents * 3 <= ctl->extents_thresh)
				return false;
		} else {
			return false;
		}
	}

	/*
	 * The original block groups from mkfs can be really small, like 8
	 * megabytes, so don't bother with a bitmap for those entries.  However
	 * some block groups can be smaller than what a bitmap would cover but
	 * are still large enough that they could overflow the 32k memory limit,
	 * so still allow those block groups to have a bitmap entry.
	 */
	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
		return false;

	return true;
}

static const struct btrfs_free_space_op free_space_op = {
	.use_bitmap		= use_bitmap,
};

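/*
 * Add the range described by @info into bitmap entries, creating a new
 * bitmap if none covers the range yet.  Returns 1 if all of the space was
 * consumed, 0 if the caller should fall back to a regular extent entry,
 * and a negative errno on allocation failure.
 */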
static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	struct btrfs_free_space *bitmap_info;
	struct btrfs_block_group *block_group = NULL;
	int added = 0;
	u64 bytes, offset, bytes_added;
	enum btrfs_trim_state trim_state;
	int ret;

	bytes = info->bytes;
	offset = info->offset;
	trim_state = info->trim_state;

	if (!ctl->op->use_bitmap(ctl, info))
		return 0;

	if (ctl->op == &free_space_op)
		block_group = ctl->private;
again:
	/*
	 * Since we link bitmaps right into the cluster we need to see if we
	 * have a cluster here, and if so and it has our bitmap we need to add
	 * the free space to that bitmap.
	 */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		struct btrfs_free_cluster *cluster;
		struct rb_node *node;
		struct btrfs_free_space *entry;

		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
		spin_lock(&cluster->lock);
		node = rb_first(&cluster->root);
		if (!node) {
			spin_unlock(&cluster->lock);
			goto no_cluster_bitmap;
		}

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!entry->bitmap) {
			spin_unlock(&cluster->lock);
			goto no_cluster_bitmap;
		}

		if (entry->offset == offset_to_bitmap(ctl, offset)) {
			bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
							  bytes, trim_state);
			bytes -= bytes_added;
			offset += bytes_added;
		}
		spin_unlock(&cluster->lock);
		if (!bytes) {
			ret = 1;
			goto out;
		}
	}

no_cluster_bitmap:
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		ASSERT(added == 0);
		goto new_bitmap;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);
	bytes -= bytes_added;
	offset += bytes_added;
	added = 0;

	if (!bytes) {
		ret = 1;
		goto out;
	} else
		goto again;

new_bitmap:
	if (info && info->bitmap) {
		add_new_bitmap(ctl, info, offset);
		added = 1;
		info = NULL;
		goto again;
	} else {
		spin_unlock(&ctl->tree_lock);

		/* no pre-allocated info, allocate a new one */
		if (!info) {
			info = kmem_cache_zalloc(btrfs_free_space_cachep,
						 GFP_NOFS);
			if (!info) {
				spin_lock(&ctl->tree_lock);
				ret = -ENOMEM;
				goto out;
			}
		}

		/* allocate the bitmap */
		info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
						 GFP_NOFS);
		info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
		spin_lock(&ctl->tree_lock);
		if (!info->bitmap) {
			ret = -ENOMEM;
			goto out;
		}
		goto again;
	}

out:
	if (info) {
		if (info->bitmap)
			kmem_cache_free(btrfs_free_space_bitmap_cachep,
					info->bitmap);
		kmem_cache_free(btrfs_free_space_cachep, info);
	}

	return ret;
}

/*
 * Free space merging rules:
 *  1) Merge trimmed areas together
 *  2) Let untrimmed areas coalesce with trimmed areas
 *  3) Always pull neighboring regions from bitmaps
 *
 * The above rules are for when we merge free space based on btrfs_trim_state.
 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
 * same reason: to promote larger extent regions which makes life easier for
 * find_free_extent().  Rule 2 enables coalescing based on the common path
 * being returning free space from btrfs_finish_extent_commit().  So when free
 * space is trimmed, it will prevent aggregating trimmed new region and
 * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
 * and provide find_free_extent() with the largest extents possible hoping for
 * the reuse path.
 */
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
			  struct btrfs_free_space *info, bool update_stat)
{
	struct btrfs_free_space *left_info = NULL;
	struct btrfs_free_space *right_info;
	bool merged = false;
	u64 offset = info->offset;
	u64 bytes = info->bytes;
	const bool is_trimmed = btrfs_free_space_trimmed(info);

	/*
	 * first we want to see if there is free space adjacent to the range we
	 * are adding, if there is remove that struct and add a new one to
	 * cover the entire range
	 */
	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
	if (right_info && rb_prev(&right_info->offset_index))
		left_info = rb_entry(rb_prev(&right_info->offset_index),
				     struct btrfs_free_space, offset_index);
	else if (!right_info)
		left_info = tree_search_offset(ctl, offset - 1, 0, 0);

	/* See try_merge_free_space() comment. */
	if (right_info && !right_info->bitmap &&
	    (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
		if (update_stat)
			unlink_free_space(ctl, right_info);
		else
			__unlink_free_space(ctl, right_info);
		info->bytes += right_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, right_info);
		merged = true;
	}

	/* See try_merge_free_space() comment. */
	if (left_info && !left_info->bitmap &&
	    left_info->offset + left_info->bytes == offset &&
	    (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
		if (update_stat)
			unlink_free_space(ctl, left_info);
		else
			__unlink_free_space(ctl, left_info);
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, left_info);
		merged = true;
	}

	return merged;
}

static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     bool update_stat)
{
	struct btrfs_free_space *bitmap;
	unsigned long i;
	unsigned long j;
	const u64 end = info->offset + info->bytes;
	const u64 bitmap_offset = offset_to_bitmap(ctl, end);
	u64 bytes;

	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
	if (!bitmap)
		return false;

	i = offset_to_bit(bitmap->offset, ctl->unit, end);
	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
	if (j == i)
		return false;
	bytes = (j - i) * ctl->unit;
	info->bytes += bytes;

	/* See try_merge_free_space() comment. */
	if (!btrfs_free_space_trimmed(bitmap))
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (update_stat)
		bitmap_clear_bits(ctl, bitmap, end, bytes);
	else
		__bitmap_clear_bits(ctl, bitmap, end, bytes);

	if (!bitmap->bytes)
		free_bitmap(ctl, bitmap);

	return true;
}

static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *info,
				       bool update_stat)
{
	struct btrfs_free_space *bitmap;
	u64 bitmap_offset;
	unsigned long i;
	unsigned long j;
	unsigned long prev_j;
	u64 bytes;

	bitmap_offset = offset_to_bitmap(ctl, info->offset);
	/* If we're on a boundary, try the previous logical bitmap. */
	if (bitmap_offset == info->offset) {
		if (info->offset == 0)
			return false;
		bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
	}

	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
	if (!bitmap)
		return false;

	i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
	j = 0;
	prev_j = (unsigned long)-1;
	for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
		if (j > i)
			break;
		prev_j = j;
	}
	if (prev_j == i)
		return false;

	if (prev_j == (unsigned long)-1)
		bytes = (i + 1) * ctl->unit;
	else
		bytes = (i - prev_j) * ctl->unit;

	info->offset -= bytes;
	info->bytes += bytes;

	/* See try_merge_free_space() comment. */
	if (!btrfs_free_space_trimmed(bitmap))
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (update_stat)
		bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
	else
		__bitmap_clear_bits(ctl, bitmap, info->offset, bytes);

	if (!bitmap->bytes)
		free_bitmap(ctl, bitmap);

	return true;
}

/*
 * We prefer always to allocate from extent entries, both for clustered and
 * non-clustered allocation requests. So when attempting to add a new extent
 * entry, try to see if there's adjacent free space in bitmap entries, and if
 * there is, migrate that space from the bitmaps to the extent.
 * Like this we get better chances of satisfying space allocation requests
 * because we attempt to satisfy them based on a single cache entry, and never
 * on 2 or more entries - even if the entries represent a contiguous free space
 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
 * ends).
 */
static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info,
			      bool update_stat)
{
	/*
	 * Only work with disconnected entries, as we can change their offset,
	 * and must be extent entries.
	 */
	ASSERT(!info->bitmap);
	ASSERT(RB_EMPTY_NODE(&info->offset_index));

	if (ctl->total_bitmaps > 0) {
		bool stole_end;
		bool stole_front = false;

		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
		if (ctl->total_bitmaps > 0)
			stole_front = steal_from_bitmap_to_front(ctl, info,
								 update_stat);

		if (stole_end || stole_front)
			try_merge_free_space(ctl, info, update_stat);
	}
}

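/*
 * Add free space to the in-memory cache: try to merge it with neighboring
 * extent entries, drop it into a bitmap if the bitmap threshold says so,
 * and otherwise link it as a new extent entry, stealing adjacent free
 * space back out of bitmaps where possible.
 */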
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
			   struct btrfs_free_space_ctl *ctl,
			   u64 offset, u64 bytes,
			   enum btrfs_trim_state trim_state)
{
	struct btrfs_block_group *block_group = ctl->private;
	struct btrfs_free_space *info;
	int ret = 0;
	u64 filter_bytes = bytes;

	ASSERT(!btrfs_is_zoned(fs_info));

	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;
	info->trim_state = trim_state;
	RB_CLEAR_NODE(&info->offset_index);

	spin_lock(&ctl->tree_lock);

	if (try_merge_free_space(ctl, info, true))
		goto link;

	/*
	 * If there was no extent directly to the left or right of this new
	 * extent then we know we're going to have to allocate a new extent, so
	 * before we do that see if we need to drop this into a bitmap
	 */
	ret = insert_into_bitmap(ctl, info);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		ret = 0;
		goto out;
	}
link:
	/*
	 * Only steal free space from adjacent bitmaps if we're sure we're not
	 * going to add the new free space to existing bitmap entries - because
	 * that would mean unnecessary work that would be reverted. Therefore
	 * attempt to steal space from bitmaps if we're adding an extent entry.
	 */
	steal_from_bitmap(ctl, info, true);

	filter_bytes = max(filter_bytes, info->bytes);

	ret = link_free_space(ctl, info);
	if (ret)
		kmem_cache_free(btrfs_free_space_cachep, info);
out:
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);

	if (ret) {
		btrfs_crit(fs_info, "unable to add free space :%d", ret);
		ASSERT(ret != -EEXIST);
	}

	if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
		btrfs_discard_check_filter(block_group, filter_bytes);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
	}

	return ret;
}

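/*
 * Zoned variant of adding free space.  Only the part of the range at or
 * beyond the zone's allocation offset can be reused, so that part is
 * accounted as free space while the rest becomes zone_unusable; block
 * groups that end up fully unusable are queued for reuse or reclaim.
 */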
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
					u64 bytenr, u64 size, bool used)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	u64 offset = bytenr - block_group->start;
	u64 to_free, to_unusable;

	spin_lock(&ctl->tree_lock);
	if (!used)
		to_free = size;
	else if (offset >= block_group->alloc_offset)
		to_free = size;
	else if (offset + size <= block_group->alloc_offset)
		to_free = 0;
	else
		to_free = offset + size - block_group->alloc_offset;
	to_unusable = size - to_free;

	ctl->free_space += to_free;
	/*
	 * If the block group is read-only, we should account freed space into
	 * bytes_readonly.
	 */
	if (!block_group->ro)
		block_group->zone_unusable += to_unusable;
	spin_unlock(&ctl->tree_lock);
	if (!used) {
		spin_lock(&block_group->lock);
		block_group->alloc_offset -= size;
		spin_unlock(&block_group->lock);
	}

	/* All the region is now unusable. Mark it as unused and reclaim */
	if (block_group->zone_unusable == block_group->length) {
		btrfs_mark_bg_unused(block_group);
	} else if (block_group->zone_unusable >=
		   div_factor_fine(block_group->length,
				   fs_info->bg_reclaim_threshold)) {
		btrfs_mark_bg_to_reclaim(block_group);
	}

	return 0;
}

int btrfs_add_free_space(struct btrfs_block_group *block_group,
			 u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    true);

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group->fs_info,
				      block_group->free_space_ctl,
				      bytenr, size, trim_state);
}

int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
				u64 bytenr, u64 size)
{
	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    false);

	return btrfs_add_free_space(block_group, bytenr, size);
}

/*
 * This is a subtle distinction because when adding free space back in general,
 * we want it to be added as untrimmed for async. But in the case where we add
 * it on loading of a block group, we want to consider it trimmed.
 */
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
				       u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    true);

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
	    btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group->fs_info,
				      block_group->free_space_ctl,
				      bytenr, size, trim_state);
}

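/*
 * Remove [offset, offset + bytes) from the free space cache, splitting
 * extent entries and clearing bitmap bits as needed.  On zoned filesystems
 * this only advances the allocation offset, which is all that is needed
 * when replaying the log.
 */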
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
			    u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	int ret;
	bool re_search = false;

	if (btrfs_is_zoned(block_group->fs_info)) {
		/*
		 * This can happen with conventional zones when replaying log.
		 * Since the allocation info of tree-log nodes are not recorded
		 * to the extent-tree, calculate_alloc_pointer() failed to
		 * advance the allocation pointer after last allocated tree log
		 * node blocks.
		 *
		 * This function is called from
		 * btrfs_pin_extent_for_log_replay() when replaying the log.
		 * Advance the pointer not to overwrite the tree-log nodes.
		 */
		if (block_group->alloc_offset < offset + bytes)
			block_group->alloc_offset = offset + bytes;
		return 0;
	}

	spin_lock(&ctl->tree_lock);

again:
	ret = 0;
	if (!bytes)
		goto out_lock;

	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		/*
		 * oops didn't find an extent that matched the space we wanted
		 * to remove, look for a bitmap instead
		 */
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info) {
			/*
			 * If we found a partial bit of our free space in a
			 * bitmap but then couldn't find the other part this may
			 * be a problem, so WARN about it.
			 */
			WARN_ON(re_search);
			goto out_lock;
		}
	}

	re_search = false;
	if (!info->bitmap) {
		unlink_free_space(ctl, info);
		if (offset == info->offset) {
			u64 to_free = min(bytes, info->bytes);

			info->bytes -= to_free;
			info->offset += to_free;
			if (info->bytes) {
				ret = link_free_space(ctl, info);
				WARN_ON(ret);
			} else {
				kmem_cache_free(btrfs_free_space_cachep, info);
			}

			offset += to_free;
			bytes -= to_free;
			goto again;
		} else {
			u64 old_end = info->bytes + info->offset;

			info->bytes = offset - info->offset;
			ret = link_free_space(ctl, info);
			WARN_ON(ret);
			if (ret)
				goto out_lock;

			/* Not enough bytes in this entry to satisfy us */
			if (old_end < offset + bytes) {
				bytes -= old_end - offset;
				offset = old_end;
				goto again;
			} else if (old_end == offset + bytes) {
				/* all done */
				goto out_lock;
			}
			spin_unlock(&ctl->tree_lock);

			ret = __btrfs_add_free_space(block_group->fs_info, ctl,
						     offset + bytes,
						     old_end - (offset + bytes),
						     info->trim_state);
			WARN_ON(ret);
			goto out;
		}
	}

	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
	if (ret == -EAGAIN) {
		re_search = true;
		goto again;
	}
out_lock:
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
out:
	return ret;
}

void btrfs_dump_free_space(struct btrfs_block_group *block_group,
			   u64 bytes)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	struct rb_node *n;
	int count = 0;

	/*
	 * Zoned btrfs does not use free space tree and cluster. Just print
	 * out the free space after the allocation offset.
	 */
	if (btrfs_is_zoned(fs_info)) {
		btrfs_info(fs_info, "free space %llu",
			   block_group->length - block_group->alloc_offset);
		return;
	}

	spin_lock(&ctl->tree_lock);
	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (info->bytes >= bytes && !block_group->ro)
			count++;
		btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
			   info->offset, info->bytes,
			   (info->bitmap) ? "yes" : "no");
	}
	spin_unlock(&ctl->tree_lock);
	btrfs_info(fs_info, "block group has cluster?: %s",
	       list_empty(&block_group->cluster_list) ? "no" : "yes");
	btrfs_info(fs_info,
		   "%d blocks of free space at or bigger than bytes is", count);
}

void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
			       struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	spin_lock_init(&ctl->tree_lock);
	ctl->unit = fs_info->sectorsize;
	ctl->start = block_group->start;
	ctl->private = block_group;
	ctl->op = &free_space_op;
	INIT_LIST_HEAD(&ctl->trimming_ranges);
	mutex_init(&ctl->cache_writeout_mutex);

	/*
	 * we only want to have 32k of ram per block group for keeping
	 * track of free space, and if we pass 1/2 of that we want to
	 * start converting things over to using bitmaps
	 */
	ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
}

/*
 * for a given cluster, put all of its extents back into the free
 * space cache.  If the block group passed doesn't match the block group
 * pointed to by the cluster, someone else raced in and freed the
 * cluster already.  In that case, we just return without changing anything
 */
static void __btrfs_return_cluster_to_free_space(
			     struct btrfs_block_group *block_group,
			     struct btrfs_free_cluster *cluster)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	struct rb_node *node;

	spin_lock(&cluster->lock);
	if (cluster->block_group != block_group) {
		spin_unlock(&cluster->lock);
		return;
	}

	cluster->block_group = NULL;
	cluster->window_start = 0;
	list_del_init(&cluster->block_group_list);

	node = rb_first(&cluster->root);
	while (node) {
		bool bitmap;

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		rb_erase(&entry->offset_index, &cluster->root);
		RB_CLEAR_NODE(&entry->offset_index);

		bitmap = (entry->bitmap != NULL);
		if (!bitmap) {
			/* Merging treats extents as if they were new */
			if (!btrfs_free_space_trimmed(entry)) {
				ctl->discardable_extents[BTRFS_STAT_CURR]--;
				ctl->discardable_bytes[BTRFS_STAT_CURR] -=
					entry->bytes;
			}

			try_merge_free_space(ctl, entry, false);
			steal_from_bitmap(ctl, entry, false);

			/* As we insert directly, update these statistics */
			if (!btrfs_free_space_trimmed(entry)) {
				ctl->discardable_extents[BTRFS_STAT_CURR]++;
				ctl->discardable_bytes[BTRFS_STAT_CURR] +=
					entry->bytes;
			}
		}
		tree_insert_offset(&ctl->free_space_offset,
				   entry->offset, &entry->offset_index, bitmap);
	}
	cluster->root = RB_ROOT;
	spin_unlock(&cluster->lock);
	btrfs_put_block_group(block_group);
}

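/*
 * Free every extent and bitmap entry in the cache.  The _locked variant
 * expects ctl->tree_lock to be held (it may drop and re-take it via
 * cond_resched_lock()).
 */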
static void __btrfs_remove_free_space_cache_locked(
				struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *node;

	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(ctl, info);
		}

		cond_resched_lock(&ctl->tree_lock);
	}
}

void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
	spin_lock(&ctl->tree_lock);
	__btrfs_remove_free_space_cache_locked(ctl);
	if (ctl->private)
		btrfs_discard_update_discardable(ctl->private);
	spin_unlock(&ctl->tree_lock);
}

void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_cluster *cluster;
	struct list_head *head;

	spin_lock(&ctl->tree_lock);
	while ((head = block_group->cluster_list.next) !=
	       &block_group->cluster_list) {
		cluster = list_entry(head, struct btrfs_free_cluster,
				     block_group_list);

		WARN_ON(cluster->block_group != block_group);
		__btrfs_return_cluster_to_free_space(block_group, cluster);

		cond_resched_lock(&ctl->tree_lock);
	}
	__btrfs_remove_free_space_cache_locked(ctl);
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
}

/**
 * btrfs_is_free_space_trimmed - see if everything is trimmed
 * @block_group: block_group of interest
 *
 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
 */
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	struct rb_node *node;
	bool ret = true;

	spin_lock(&ctl->tree_lock);
	node = rb_first(&ctl->free_space_offset);

	while (node) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);

		if (!btrfs_free_space_trimmed(info)) {
			ret = false;
			break;
		}

		node = rb_next(node);
	}

	spin_unlock(&ctl->tree_lock);
	return ret;
}

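/*
 * Find and remove @bytes (+ @empty_size) of free space for an allocation,
 * honouring the block group's stripe alignment.  Any alignment gap carved
 * off the front of an extent entry is returned to the free space cache,
 * and *max_extent_size is updated for the caller when nothing large enough
 * is found.
 */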
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
			       u64 offset, u64 bytes, u64 empty_size,
			       u64 *max_extent_size)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
	struct btrfs_free_space *entry = NULL;
	u64 bytes_search = bytes + empty_size;
	u64 ret = 0;
	u64 align_gap = 0;
	u64 align_gap_len = 0;
	enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	ASSERT(!btrfs_is_zoned(block_group->fs_info));

	spin_lock(&ctl->tree_lock);
	entry = find_free_space(ctl, &offset, &bytes_search,
				block_group->full_stripe_len, max_extent_size);
	if (!entry)
		goto out;

	ret = offset;
	if (entry->bitmap) {
		bitmap_clear_bits(ctl, entry, offset, bytes);

		if (!btrfs_free_space_trimmed(entry))
			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);

		if (!entry->bytes)
			free_bitmap(ctl, entry);
	} else {
		unlink_free_space(ctl, entry);
		align_gap_len = offset - entry->offset;
		align_gap = entry->offset;
		align_gap_trim_state = entry->trim_state;

		if (!btrfs_free_space_trimmed(entry))
			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);

		entry->offset = offset + bytes;
		WARN_ON(entry->bytes < bytes + align_gap_len);

		entry->bytes -= bytes + align_gap_len;
		if (!entry->bytes)
			kmem_cache_free(btrfs_free_space_cachep, entry);
		else
			link_free_space(ctl, entry);
	}
out:
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);

	if (align_gap_len)
		__btrfs_add_free_space(block_group->fs_info, ctl,
				       align_gap, align_gap_len,
				       align_gap_trim_state);
	return ret;
}

/*
 * given a cluster, put all of its extents back into the free space
 * cache.  If a block group is passed, this function will only free
 * a cluster that belongs to the passed block group.
 *
 * Otherwise, it'll get a reference on the block group pointed to by the
 * cluster and remove the cluster from it.
 */
void btrfs_return_cluster_to_free_space(
			       struct btrfs_block_group *block_group,
			       struct btrfs_free_cluster *cluster)
{
	struct btrfs_free_space_ctl *ctl;

	/* first, get a safe pointer to the block group */
	spin_lock(&cluster->lock);
	if (!block_group) {
		block_group = cluster->block_group;
		if (!block_group) {
			spin_unlock(&cluster->lock);
			return;
		}
	} else if (cluster->block_group != block_group) {
		/* someone else has already freed it don't redo their work */
		spin_unlock(&cluster->lock);
		return;
	}
	btrfs_get_block_group(block_group);
	spin_unlock(&cluster->lock);

	ctl = block_group->free_space_ctl;

	/* now return any extents the cluster had on it */
	spin_lock(&ctl->tree_lock);
	__btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&ctl->tree_lock);

	btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);

	/* finally drop our ref */
	btrfs_put_block_group(block_group);
}

3046
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
3047
				   struct btrfs_free_cluster *cluster,
3048
				   struct btrfs_free_space *entry,
3049 3050
				   u64 bytes, u64 min_start,
				   u64 *max_extent_size)
3051
{
3052
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3053 3054 3055 3056 3057 3058 3059 3060
	int err;
	u64 search_start = cluster->window_start;
	u64 search_bytes = bytes;
	u64 ret = 0;

	search_start = min_start;
	search_bytes = bytes;

3061
	err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
3062
	if (err) {
J
Josef Bacik 已提交
3063 3064
		*max_extent_size = max(get_max_extent_size(entry),
				       *max_extent_size);
3065
		return 0;
3066
	}
3067 3068

	ret = search_start;
3069
	__bitmap_clear_bits(ctl, entry, ret, bytes);
3070 3071 3072 3073

	return ret;
}

3074 3075 3076 3077 3078
/*
 * given a cluster, try to allocate 'bytes' from it, returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
 * if things worked out
 */
3079
u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
3080
			     struct btrfs_free_cluster *cluster, u64 bytes,
3081
			     u64 min_start, u64 *max_extent_size)
3082
{
3083
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3084 3085
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
3086 3087 3088 3089
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;
	u64 ret = 0;

3090 3091
	ASSERT(!btrfs_is_zoned(block_group->fs_info));

3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103
	spin_lock(&cluster->lock);
	if (bytes > cluster->max_size)
		goto out;

	if (cluster->block_group != block_group)
		goto out;

	node = rb_first(&cluster->root);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_free_space, offset_index);
3104
	while (1) {
J
Josef Bacik 已提交
3105 3106 3107
		if (entry->bytes < bytes)
			*max_extent_size = max(get_max_extent_size(entry),
					       *max_extent_size);
3108

3109 3110
		if (entry->bytes < bytes ||
		    (!entry->bitmap && entry->offset < min_start)) {
3111 3112 3113 3114 3115 3116 3117 3118
			node = rb_next(&entry->offset_index);
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
			continue;
		}

3119 3120 3121
		if (entry->bitmap) {
			ret = btrfs_alloc_from_bitmap(block_group,
						      cluster, entry, bytes,
3122 3123
						      cluster->window_start,
						      max_extent_size);
3124 3125 3126 3127 3128 3129 3130 3131
			if (ret == 0) {
				node = rb_next(&entry->offset_index);
				if (!node)
					break;
				entry = rb_entry(node, struct btrfs_free_space,
						 offset_index);
				continue;
			}
3132
			cluster->window_start += bytes;
3133 3134 3135 3136 3137 3138
		} else {
			ret = entry->offset;

			entry->offset += bytes;
			entry->bytes -= bytes;
		}
3139 3140 3141 3142 3143

		break;
	}
out:
	spin_unlock(&cluster->lock);
3144

3145 3146 3147
	if (!ret)
		return 0;

3148
	spin_lock(&ctl->tree_lock);
3149

3150 3151 3152
	if (!btrfs_free_space_trimmed(entry))
		atomic64_add(bytes, &discard_ctl->discard_bytes_saved);

3153
	ctl->free_space -= bytes;
3154 3155
	if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3156 3157

	spin_lock(&cluster->lock);
3158
	if (entry->bytes == 0) {
3159
		rb_erase(&entry->offset_index, &cluster->root);
3160
		ctl->free_extents--;
3161
		if (entry->bitmap) {
3162 3163
			kmem_cache_free(btrfs_free_space_bitmap_cachep,
					entry->bitmap);
3164
			ctl->total_bitmaps--;
3165
			recalculate_thresholds(ctl);
3166 3167
		} else if (!btrfs_free_space_trimmed(entry)) {
			ctl->discardable_extents[BTRFS_STAT_CURR]--;
3168
		}
3169
		kmem_cache_free(btrfs_free_space_cachep, entry);
3170 3171
	}

3172
	spin_unlock(&cluster->lock);
3173
	spin_unlock(&ctl->tree_lock);
3174

3175 3176 3177
	return ret;
}

3178
static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3179 3180
				struct btrfs_free_space *entry,
				struct btrfs_free_cluster *cluster,
3181 3182
				u64 offset, u64 bytes,
				u64 cont1_bytes, u64 min_bytes)
3183
{
3184
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3185 3186
	unsigned long next_zero;
	unsigned long i;
3187 3188
	unsigned long want_bits;
	unsigned long min_bits;
3189
	unsigned long found_bits;
3190
	unsigned long max_bits = 0;
3191 3192
	unsigned long start = 0;
	unsigned long total_found = 0;
3193
	int ret;
3194

3195
	i = offset_to_bit(entry->offset, ctl->unit,
3196
			  max_t(u64, offset, entry->offset));
3197 3198
	want_bits = bytes_to_bits(bytes, ctl->unit);
	min_bits = bytes_to_bits(min_bytes, ctl->unit);
3199

3200 3201 3202 3203 3204 3205 3206
	/*
	 * Don't bother looking for a cluster in this bitmap if it's heavily
	 * fragmented.
	 */
	if (entry->max_extent_size &&
	    entry->max_extent_size < cont1_bytes)
		return -ENOSPC;
3207 3208
again:
	found_bits = 0;
3209
	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3210 3211
		next_zero = find_next_zero_bit(entry->bitmap,
					       BITS_PER_BITMAP, i);
3212
		if (next_zero - i >= min_bits) {
3213
			found_bits = next_zero - i;
3214 3215
			if (found_bits > max_bits)
				max_bits = found_bits;
3216 3217
			break;
		}
3218 3219
		if (next_zero - i > max_bits)
			max_bits = next_zero - i;
3220 3221 3222
		i = next_zero;
	}

3223 3224
	if (!found_bits) {
		entry->max_extent_size = (u64)max_bits * ctl->unit;
3225
		return -ENOSPC;
3226
	}
3227

3228
	if (!total_found) {
3229
		start = i;
3230
		cluster->max_size = 0;
3231 3232 3233 3234
	}

	total_found += found_bits;

3235 3236
	if (cluster->max_size < found_bits * ctl->unit)
		cluster->max_size = found_bits * ctl->unit;
3237

3238 3239
	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
		i = next_zero + 1;
3240 3241 3242
		goto again;
	}

3243
	cluster->window_start = start * ctl->unit + entry->offset;
3244
	rb_erase(&entry->offset_index, &ctl->free_space_offset);
3245 3246
	ret = tree_insert_offset(&cluster->root, entry->offset,
				 &entry->offset_index, 1);
3247
	ASSERT(!ret); /* -EEXIST; Logic error */
3248

J
Josef Bacik 已提交
3249
	trace_btrfs_setup_cluster(block_group, cluster,
3250
				  total_found * ctl->unit, 1);
3251 3252 3253
	return 0;
}

3254 3255
/*
 * This searches the block group for just extents to fill the cluster with.
3256 3257
 * Try to find a cluster with at least bytes total bytes, at least one
 * extent of cont1_bytes, and other clusters of at least min_bytes.
3258
 */
3259
static noinline int
3260
setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3261 3262
			struct btrfs_free_cluster *cluster,
			struct list_head *bitmaps, u64 offset, u64 bytes,
3263
			u64 cont1_bytes, u64 min_bytes)
3264
{
3265
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3266 3267 3268 3269 3270 3271
	struct btrfs_free_space *first = NULL;
	struct btrfs_free_space *entry = NULL;
	struct btrfs_free_space *last;
	struct rb_node *node;
	u64 window_free;
	u64 max_extent;
J
Josef Bacik 已提交
3272
	u64 total_size = 0;
3273

3274
	entry = tree_search_offset(ctl, offset, 0, 1);
3275 3276 3277 3278 3279 3280 3281
	if (!entry)
		return -ENOSPC;

	/*
	 * We don't want bitmaps, so just move along until we find a normal
	 * extent entry.
	 */
3282 3283
	while (entry->bitmap || entry->bytes < min_bytes) {
		if (entry->bitmap && list_empty(&entry->list))
3284
			list_add_tail(&entry->list, bitmaps);
3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295
		node = rb_next(&entry->offset_index);
		if (!node)
			return -ENOSPC;
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
	}

	window_free = entry->bytes;
	max_extent = entry->bytes;
	first = entry;
	last = entry;

3296 3297
	for (node = rb_next(&entry->offset_index); node;
	     node = rb_next(&entry->offset_index)) {
3298 3299
		entry = rb_entry(node, struct btrfs_free_space, offset_index);

3300 3301 3302
		if (entry->bitmap) {
			if (list_empty(&entry->list))
				list_add_tail(&entry->list, bitmaps);
3303
			continue;
3304 3305
		}

3306 3307 3308 3309 3310 3311
		if (entry->bytes < min_bytes)
			continue;

		last = entry;
		window_free += entry->bytes;
		if (entry->bytes > max_extent)
3312 3313 3314
			max_extent = entry->bytes;
	}

3315 3316 3317
	if (window_free < bytes || max_extent < cont1_bytes)
		return -ENOSPC;

3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330
	cluster->window_start = first->offset;

	node = &first->offset_index;

	/*
	 * now we've found our entries, pull them out of the free space
	 * cache and put them into the cluster rbtree
	 */
	do {
		int ret;

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
3331
		if (entry->bitmap || entry->bytes < min_bytes)
3332 3333
			continue;

3334
		rb_erase(&entry->offset_index, &ctl->free_space_offset);
3335 3336
		ret = tree_insert_offset(&cluster->root, entry->offset,
					 &entry->offset_index, 0);
J
Josef Bacik 已提交
3337
		total_size += entry->bytes;
3338
		ASSERT(!ret); /* -EEXIST; Logic error */
3339 3340 3341
	} while (node && entry != last);

	cluster->max_size = max_extent;
J
Josef Bacik 已提交
3342
	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3343 3344 3345 3346 3347 3348 3349
	return 0;
}

/*
 * This specifically looks for bitmaps that may work in the cluster, we assume
 * that we have already failed to find extents that will work.
 */
3350
static noinline int
3351
setup_cluster_bitmap(struct btrfs_block_group *block_group,
3352 3353
		     struct btrfs_free_cluster *cluster,
		     struct list_head *bitmaps, u64 offset, u64 bytes,
3354
		     u64 cont1_bytes, u64 min_bytes)
3355
{
3356
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3357
	struct btrfs_free_space *entry = NULL;
3358
	int ret = -ENOSPC;
3359
	u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3360

3361
	if (ctl->total_bitmaps == 0)
3362 3363
		return -ENOSPC;

3364 3365 3366 3367
	/*
	 * The bitmap that covers offset won't be in the list unless offset
	 * is just its start offset.
	 */
3368 3369 3370 3371
	if (!list_empty(bitmaps))
		entry = list_first_entry(bitmaps, struct btrfs_free_space, list);

	if (!entry || entry->offset != bitmap_offset) {
3372 3373 3374 3375 3376
		entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
		if (entry && list_empty(&entry->list))
			list_add(&entry->list, bitmaps);
	}

3377
	list_for_each_entry(entry, bitmaps, list) {
3378
		if (entry->bytes < bytes)
3379 3380
			continue;
		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3381
					   bytes, cont1_bytes, min_bytes);
3382 3383 3384 3385 3386
		if (!ret)
			return 0;
	}

	/*
3387 3388
	 * The bitmaps list has all the bitmaps that record free space
	 * starting after offset, so no more search is required.
3389
	 */
3390
	return -ENOSPC;
3391 3392
}

3393 3394
/*
 * here we try to find a cluster of blocks in a block group.  The goal
3395
 * is to find at least bytes+empty_size.
3396 3397 3398 3399 3400
 * We might not find them all in one contiguous area.
 *
 * returns zero and sets up cluster if things worked out, otherwise
 * it returns -enospc
 */
3401
int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3402 3403 3404
			     struct btrfs_free_cluster *cluster,
			     u64 offset, u64 bytes, u64 empty_size)
{
3405
	struct btrfs_fs_info *fs_info = block_group->fs_info;
3406
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3407
	struct btrfs_free_space *entry, *tmp;
3408
	LIST_HEAD(bitmaps);
3409
	u64 min_bytes;
3410
	u64 cont1_bytes;
3411 3412
	int ret;

3413 3414 3415 3416 3417 3418
	/*
	 * Choose the minimum extent size we'll require for this
	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
	 * For metadata, allow allocates with smaller extents.  For
	 * data, keep it dense.
	 */
3419
	if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3420
		cont1_bytes = min_bytes = bytes + empty_size;
3421
	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3422
		cont1_bytes = bytes;
3423
		min_bytes = fs_info->sectorsize;
3424 3425
	} else {
		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3426
		min_bytes = fs_info->sectorsize;
3427
	}
3428

3429
	spin_lock(&ctl->tree_lock);
3430 3431 3432 3433 3434

	/*
	 * If we know we don't have enough space to make a cluster don't even
	 * bother doing all the work to try and find one.
	 */
3435
	if (ctl->free_space < bytes) {
3436
		spin_unlock(&ctl->tree_lock);
3437 3438 3439
		return -ENOSPC;
	}

3440 3441 3442 3443 3444 3445 3446 3447
	spin_lock(&cluster->lock);

	/* someone already found a cluster, hooray */
	if (cluster->block_group) {
		ret = 0;
		goto out;
	}

J
Josef Bacik 已提交
3448 3449 3450
	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
				 min_bytes);

3451
	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3452 3453
				      bytes + empty_size,
				      cont1_bytes, min_bytes);
3454
	if (ret)
3455
		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3456 3457
					   offset, bytes + empty_size,
					   cont1_bytes, min_bytes);
3458 3459 3460 3461

	/* Clear our temporary list */
	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
		list_del_init(&entry->list);
3462

3463
	if (!ret) {
3464
		btrfs_get_block_group(block_group);
3465 3466 3467
		list_add_tail(&cluster->block_group_list,
			      &block_group->cluster_list);
		cluster->block_group = block_group;
J
Josef Bacik 已提交
3468 3469
	} else {
		trace_btrfs_failed_cluster_setup(block_group);
3470 3471 3472
	}
out:
	spin_unlock(&cluster->lock);
3473
	spin_unlock(&ctl->tree_lock);
3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484

	return ret;
}

/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
3485
	cluster->root = RB_ROOT;
3486
	cluster->max_size = 0;
3487
	cluster->fragmented = false;
3488 3489 3490 3491
	INIT_LIST_HEAD(&cluster->block_group_list);
	cluster->block_group = NULL;
}

3492
static int do_trimming(struct btrfs_block_group *block_group,
3493
		       u64 *total_trimmed, u64 start, u64 bytes,
3494
		       u64 reserved_start, u64 reserved_bytes,
3495
		       enum btrfs_trim_state reserved_trim_state,
3496
		       struct btrfs_trim_range *trim_entry)
3497
{
3498
	struct btrfs_space_info *space_info = block_group->space_info;
3499
	struct btrfs_fs_info *fs_info = block_group->fs_info;
3500
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3501 3502
	int ret;
	int update = 0;
3503 3504 3505
	const u64 end = start + bytes;
	const u64 reserved_end = reserved_start + reserved_bytes;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3506
	u64 trimmed = 0;
3507

3508 3509 3510 3511 3512 3513 3514 3515 3516 3517
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	if (!block_group->ro) {
		block_group->reserved += reserved_bytes;
		space_info->bytes_reserved += reserved_bytes;
		update = 1;
	}
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

3518
	ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3519
	if (!ret) {
3520
		*total_trimmed += trimmed;
3521 3522
		trim_state = BTRFS_TRIM_STATE_TRIMMED;
	}
3523

3524
	mutex_lock(&ctl->cache_writeout_mutex);
3525 3526 3527 3528 3529 3530 3531 3532
	if (reserved_start < start)
		__btrfs_add_free_space(fs_info, ctl, reserved_start,
				       start - reserved_start,
				       reserved_trim_state);
	if (start + bytes < reserved_start + reserved_bytes)
		__btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
				       reserved_trim_state);
	__btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
3533 3534
	list_del(&trim_entry->list);
	mutex_unlock(&ctl->cache_writeout_mutex);
3535 3536 3537 3538 3539 3540 3541 3542 3543

	if (update) {
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);
		if (block_group->ro)
			space_info->bytes_readonly += reserved_bytes;
		block_group->reserved -= reserved_bytes;
		space_info->bytes_reserved -= reserved_bytes;
		spin_unlock(&block_group->lock);
3544
		spin_unlock(&space_info->lock);
3545 3546 3547 3548 3549
	}

	return ret;
}

3550 3551 3552
/*
 * If @async is set, then we will trim 1 region and return.
 */
3553
static int trim_no_bitmap(struct btrfs_block_group *block_group,
			  u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			  bool async)
{
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	struct rb_node *node;
	int ret = 0;
	u64 extent_start;
	u64 extent_bytes;
	enum btrfs_trim_state extent_trim_state;
	u64 bytes;
	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

	while (start < end) {
		struct btrfs_trim_range trim_entry;

		mutex_lock(&ctl->cache_writeout_mutex);
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen)
			goto out_unlock;

		entry = tree_search_offset(ctl, start, 0, 1);
		if (!entry)
			goto out_unlock;

		/* Skip bitmaps and if async, already trimmed entries */
		while (entry->bitmap ||
		       (async && btrfs_free_space_trimmed(entry))) {
			node = rb_next(&entry->offset_index);
			if (!node)
				goto out_unlock;
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
		}

		if (entry->offset >= end)
			goto out_unlock;

		extent_start = entry->offset;
		extent_bytes = entry->bytes;
		extent_trim_state = entry->trim_state;
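		/*
		 * In async mode, trim whole free space entries (possibly
		 * capped at max_discard_size); otherwise clamp the extent to
		 * the [start, end) range we were asked to trim.
		 */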
		if (async) {
			start = entry->offset;
			bytes = entry->bytes;
			if (bytes < minlen) {
				spin_unlock(&ctl->tree_lock);
				mutex_unlock(&ctl->cache_writeout_mutex);
				goto next;
			}
			unlink_free_space(ctl, entry);
			/*
			 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
			 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
			 * X when we come back around.  So trim it now.
			 */
			if (max_discard_size &&
			    bytes >= (max_discard_size +
				      BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
				bytes = max_discard_size;
				extent_bytes = max_discard_size;
				entry->offset += max_discard_size;
				entry->bytes -= max_discard_size;
				link_free_space(ctl, entry);
			} else {
				kmem_cache_free(btrfs_free_space_cachep, entry);
			}
		} else {
			start = max(start, extent_start);
			bytes = min(extent_start + extent_bytes, end) - start;
			if (bytes < minlen) {
				spin_unlock(&ctl->tree_lock);
				mutex_unlock(&ctl->cache_writeout_mutex);
				goto next;
			}

			unlink_free_space(ctl, entry);
			kmem_cache_free(btrfs_free_space_cachep, entry);
		}

		spin_unlock(&ctl->tree_lock);
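		/*
		 * Track this range in ->trimming_ranges so that a concurrent
		 * free space cache writeout still records it as free space
		 * while the discard is in flight.
		 */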
		trim_entry.start = extent_start;
		trim_entry.bytes = extent_bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  extent_start, extent_bytes, extent_trim_state,
				  &trim_entry);
		if (ret) {
			block_group->discard_cursor = start + bytes;
			break;
		}
next:
		start += bytes;
		block_group->discard_cursor = start;
		if (async && *total_trimmed)
			break;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;

out_unlock:
	block_group->discard_cursor = btrfs_block_group_end(block_group);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

	return ret;
}

/*
 * If we break out of trimming a bitmap prematurely, we should reset the
 * trimming bit.  In a rather contrived case, it's possible to race here so
 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
 *
 * start = start of bitmap
 * end = near end of bitmap
 *
 * Thread 1:			Thread 2:
 * trim_bitmaps(start)
 *				trim_bitmaps(end)
 *				end_trimming_bitmap()
 * reset_trimming_bitmap()
 */
static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
{
	struct btrfs_free_space *entry;

	spin_lock(&ctl->tree_lock);
	entry = tree_search_offset(ctl, offset, 1, 0);
	if (entry) {
		if (btrfs_free_space_trimmed(entry)) {
			ctl->discardable_extents[BTRFS_STAT_CURR] +=
				entry->bitmap_extents;
			ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
		}
		entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
	}

	spin_unlock(&ctl->tree_lock);
}

static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
				struct btrfs_free_space *entry)
{
	if (btrfs_free_space_trimming_bitmap(entry)) {
		entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			entry->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
	}
}

/*
 * If @async is set, then we will trim 1 region and return.
 */
static int trim_bitmaps(struct btrfs_block_group *block_group,
			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			u64 maxlen, bool async)
{
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	int ret = 0;
	int ret2;
	u64 bytes;
	u64 offset = offset_to_bitmap(ctl, start);
	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

	while (offset < end) {
		bool next_bitmap = false;
		struct btrfs_trim_range trim_entry;

		mutex_lock(&ctl->cache_writeout_mutex);
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen) {
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			break;
		}

		entry = tree_search_offset(ctl, offset, 1, 0);
		/*
		 * Bitmaps are marked trimmed lossily now to prevent constant
		 * discarding of the same bitmap (the reason why we are bound
		 * by the filters).  So, retrim the block group bitmaps when we
		 * are preparing to punt to the unused_bgs list.  This uses
		 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
		 * which is the only discard index which sets minlen to 0.
		 */
		if (!entry || (async && minlen && start == offset &&
			       btrfs_free_space_trimmed(entry))) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * Async discard bitmap trimming begins by setting the start
		 * to be key.objectid, and offset_to_bitmap() aligns it to the
		 * start of the bitmap.  This lets us know we are fully
		 * scanning the bitmap rather than only some portion of it.
		 */
		if (start == offset)
			entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;

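		/*
		 * Find a free range of at least @minlen bytes in this bitmap
		 * at or after @start; on success search_bitmap() updates
		 * @start and @bytes with what it found.
		 */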
		bytes = minlen;
		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
		if (ret2 || start >= end) {
			/*
			 * We lossily consider a bitmap trimmed if we only skip
			 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
			 */
			if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
				end_trimming_bitmap(ctl, entry);
			else
				entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * We already trimmed a region, but are using the locking above
		 * to reset the trim_state.
		 */
		if (async && *total_trimmed) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto out;
		}

		bytes = min(bytes, end - start);
		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto next;
		}

		/*
		 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
		 * If X < @minlen, we won't trim X when we come back around.
		 * So trim it now.  We differ here from trimming extents as we
		 * don't keep individual state per bit.
		 */
		if (async &&
		    max_discard_size &&
		    bytes > (max_discard_size + minlen))
			bytes = max_discard_size;

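		/*
		 * Clear the range from the bitmap before dropping the tree
		 * lock; do_trimming() adds it back to the free space cache
		 * once the discard completes.
		 */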
		bitmap_clear_bits(ctl, entry, start, bytes);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = start;
		trim_entry.bytes = bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  start, bytes, 0, &trim_entry);
		if (ret) {
			reset_trimming_bitmap(ctl, offset);
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			break;
		}
next:
		if (next_bitmap) {
			offset += BITS_PER_BITMAP * ctl->unit;
			start = offset;
		} else {
			start += bytes;
		}
		block_group->discard_cursor = start;

		if (fatal_signal_pending(current)) {
			if (start != offset)
				reset_trimming_bitmap(ctl, offset);
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	if (offset >= end)
		block_group->discard_cursor = end;

out:
	return ret;
}

int btrfs_trim_block_group(struct btrfs_block_group *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int ret;
	u64 rem = 0;

	ASSERT(!btrfs_is_zoned(block_group->fs_info));

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
	if (ret)
		goto out;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
	/* If we ended in the middle of a bitmap, reset the trimming flag */
	if (rem)
		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
	btrfs_unfreeze_block_group(block_group);
	return ret;
}

int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
	btrfs_unfreeze_block_group(block_group);

	return ret;
}

int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   u64 maxlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
			   async);

	btrfs_unfreeze_block_group(block_group);

	return ret;
}

bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}

static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Use this if you need to make a bitmap or extent entry specifically.  It
 * doesn't do any of the merging that add_free_space does, and it acts a lot
 * like how the free space cache loading stuff works, so you can get really
 * weird configurations.
 */
int test_add_free_space_entry(struct btrfs_block_group *cache,
			      u64 offset, u64 bytes, bool bitmap)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info = NULL, *bitmap_info;
	void *map = NULL;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
	u64 bytes_added;
	int ret;

again:
	if (!info) {
		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
		if (!info)
			return -ENOMEM;
	}

	if (!bitmap) {
		spin_lock(&ctl->tree_lock);
		info->offset = offset;
		info->bytes = bytes;
		info->max_extent_size = 0;
		ret = link_free_space(ctl, info);
		spin_unlock(&ctl->tree_lock);
		if (ret)
			kmem_cache_free(btrfs_free_space_cachep, info);
		return ret;
	}

	if (!map) {
		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
		if (!map) {
			kmem_cache_free(btrfs_free_space_cachep, info);
			return -ENOMEM;
		}
	}

	spin_lock(&ctl->tree_lock);
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		info->bitmap = map;
		map = NULL;
		add_new_bitmap(ctl, info, offset);
		bitmap_info = info;
		info = NULL;
	}

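	/*
	 * add_bytes_to_bitmap() may only cover the part of the range that
	 * fits in this bitmap; loop back around for any remainder.
	 */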
	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);

	bytes -= bytes_added;
	offset += bytes_added;
	spin_unlock(&ctl->tree_lock);

	if (bytes)
		goto again;

	if (info)
		kmem_cache_free(btrfs_free_space_cachep, info);
	if (map)
		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
	return 0;
}

/*
 * Checks to see if the given range is in the free space cache.  This is really
 * just used to check the absence of space, so if there is free space in the
 * range at all we will return 1.
 */
int test_check_exists(struct btrfs_block_group *cache,
		      u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;

	spin_lock(&ctl->tree_lock);
	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info)
			goto out;
	}

have_info:
	if (info->bitmap) {
		u64 bit_off, bit_bytes;
		struct rb_node *n;
		struct btrfs_free_space *tmp;

		bit_off = offset;
		bit_bytes = ctl->unit;
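		/* See if this bitmap has free space overlapping [offset, offset + bytes). */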
		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
		if (!ret) {
			if (bit_off == offset) {
				ret = 1;
				goto out;
			} else if (bit_off > offset &&
				   offset + bytes > bit_off) {
				ret = 1;
				goto out;
			}
		}

		n = rb_prev(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (tmp->offset + tmp->bytes < offset)
				break;
			if (offset + bytes < tmp->offset) {
				n = rb_prev(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		n = rb_next(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (offset + bytes < tmp->offset)
				break;
			if (tmp->offset + tmp->bytes < offset) {
				n = rb_next(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		ret = 0;
		goto out;
	}

	if (info->offset == offset) {
		ret = 1;
		goto out;
	}

	if (offset > info->offset && offset < info->offset + info->bytes)
		ret = 1;
out:
	spin_unlock(&ctl->tree_lock);
	return ret;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */