// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M

struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes);

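/*
 * Look up the inode that backs the free space cache for the block group
 * starting at @offset, by reading the free space header item
 * (BTRFS_FREE_SPACE_OBJECTID, 0, offset) and loading the inode it points to.
 */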
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
	btrfs_release_path(path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));

	return inode;
}

struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
		struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct inode *inode = NULL;
	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(fs_info->tree_root, path,
					  block_group->start);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&block_group->lock);
	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
		btrfs_info(fs_info, "Old style space inode found, converting.");
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
			BTRFS_INODE_NODATACOW;
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

	if (!block_group->iref) {
		block_group->inode = igrab(inode);
		block_group->iref = 1;
	}
	spin_unlock(&block_group->lock);

	return inode;
}

static int __create_free_space_inode(struct btrfs_root *root,
				     struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     u64 ino, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	/* We inline CRCs for the free disk space cache */
	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
	int ret;

	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memzero_extent_buffer(leaf, (unsigned long)inode_item,
			     sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, flags);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item, offset);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;
}

int create_free_space_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
					 ino, block_group->start);
}

/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
				  struct inode *inode,
				  struct btrfs_block_group *block_group)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!inode)
		inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) != -ENOENT)
			ret = PTR_ERR(inode);
		goto out;
	}
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		btrfs_add_delayed_iput(inode);
		goto out;
	}
	clear_nlink(inode);
	/* One for the block groups ref */
	spin_lock(&block_group->lock);
	if (block_group->iref) {
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
	} else {
		spin_unlock(&block_group->lock);
	}
	/* One for the lookup ref */
	btrfs_add_delayed_iput(inode);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.type = 0;
	key.offset = block_group->start;
	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
				-1, 1);
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto out;
	}
	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
				       struct btrfs_block_rsv *rsv)
{
	u64 needed_bytes;
	int ret;

	/* 1 for slack space, 1 for updating the inode */
	needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
		btrfs_calc_metadata_size(fs_info, 1);

	spin_lock(&rsv->lock);
	if (rsv->reserved < needed_bytes)
		ret = -ENOSPC;
	else
		ret = 0;
	spin_unlock(&rsv->lock);
	return ret;
}

int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group *block_group,
				    struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	bool locked = false;

	if (block_group) {
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
		locked = true;
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * Now that we've truncated the cache away, it's no longer
		 * set up or written.
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
	}

	btrfs_i_size_write(BTRFS_I(inode), 0);
	truncate_pagecache(inode, 0);

	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
	 */
	ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
					 0, BTRFS_EXTENT_DATA_KEY);
	if (ret)
		goto fail;

	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));

fail:
	if (locked)
		mutex_unlock(&trans->transaction->cache_write_mutex);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

static void readahead_cache(struct inode *inode)
{
	struct file_ra_state *ra;
	unsigned long last_index;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return;

	file_ra_state_init(ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);

	kfree(ra);
}

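/*
 * The btrfs_io_ctl helpers below walk the cache file one page at a time,
 * writing (or reading) free space entries, bitmaps, per-page CRCs and the
 * cache generation directly through the page cache.
 */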
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
		       int write)
{
	int num_pages;

	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	/* Make sure we can fit our crcs and generation into the first page */
	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
		return -ENOSPC;

	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
	if (!io_ctl->pages)
		return -ENOMEM;

	io_ctl->num_pages = num_pages;
	io_ctl->fs_info = btrfs_sb(inode->i_sb);
	io_ctl->inode = inode;

	return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
	kfree(io_ctl->pages);
	io_ctl->pages = NULL;
}

static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}

static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
	ASSERT(io_ctl->index < io_ctl->num_pages);
	io_ctl->page = io_ctl->pages[io_ctl->index++];
	io_ctl->cur = page_address(io_ctl->page);
	io_ctl->orig = io_ctl->cur;
	io_ctl->size = PAGE_SIZE;
	if (clear)
		clear_page(io_ctl->cur);
}

static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
		if (io_ctl->pages[i]) {
			ClearPageChecked(io_ctl->pages[i]);
			unlock_page(io_ctl->pages[i]);
			put_page(io_ctl->pages[i]);
		}
	}
}

static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
	struct page *page;
	struct inode *inode = io_ctl->inode;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
		int ret;

		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}

		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			io_ctl_drop_pages(io_ctl);
			return ret;
		}

		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (page->mapping != inode->i_mapping) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "free space cache page truncated");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
			if (!PageUptodate(page)) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					   "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
		}
	}

	for (i = 0; i < io_ctl->num_pages; i++)
		clear_page_dirty_for_io(io_ctl->pages[i]);

	return 0;
}

static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	io_ctl_map_page(io_ctl, 1);

	/*
	 * Skip the csum areas.  If we don't check crcs then we just have a
	 * 64bit chunk at the front of the first page.
	 */
	io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	put_unaligned_le64(generation, io_ctl->cur);
	io_ctl->cur += sizeof(u64);
}

static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	u64 cache_gen;

	/*
	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
	 * chunk at the front of the first page.
	 */
	io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	cache_gen = get_unaligned_le64(io_ctl->cur);
	if (cache_gen != generation) {
		btrfs_err_rl(io_ctl->fs_info,
			"space cache generation (%llu) does not match inode (%llu)",
				cache_gen, generation);
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}
	io_ctl->cur += sizeof(u64);
	return 0;
}

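/*
 * The first page of the cache file starts with an array of u32 CRCs, one per
 * page, followed by the generation.  io_ctl_set_crc() stores the CRC of page
 * @index into that array and io_ctl_check_crc() verifies it on load.
 */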
static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	io_ctl_unmap_page(io_ctl);
	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	*tmp = crc;
}

static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp, val;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	val = *tmp;

	io_ctl_map_page(io_ctl, 0);
	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	if (val != crc) {
		btrfs_err_rl(io_ctl->fs_info,
			"csum mismatch on free space cache");
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}

	return 0;
}

static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
			    void *bitmap)
{
	struct btrfs_free_space_entry *entry;

	if (!io_ctl->cur)
		return -ENOSPC;

	entry = io_ctl->cur;
	put_unaligned_le64(offset, &entry->offset);
	put_unaligned_le64(bytes, &entry->bytes);
	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
		BTRFS_FREE_SPACE_EXTENT;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_set_crc(io_ctl, io_ctl->index - 1);

	/* No more pages to map */
	if (io_ctl->index >= io_ctl->num_pages)
		return 0;

	/* map the next page */
	io_ctl_map_page(io_ctl, 1);
	return 0;
}

static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
	if (!io_ctl->cur)
		return -ENOSPC;

	/*
	 * If we aren't at the start of the current page, unmap this one and
	 * map the next one if there is any left.
	 */
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

	copy_page(io_ctl->cur, bitmap);
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}

static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
	/*
	 * If we're not on the boundary we know we've modified the page and we
	 * need to crc the page.
	 */
	if (io_ctl->cur != io_ctl->orig)
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	else
		io_ctl_unmap_page(io_ctl);

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	}
}

static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
			    struct btrfs_free_space *entry, u8 *type)
{
	struct btrfs_free_space_entry *e;
	int ret;

	if (!io_ctl->cur) {
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}

	e = io_ctl->cur;
	entry->offset = get_unaligned_le64(&e->offset);
	entry->bytes = get_unaligned_le64(&e->bytes);
	*type = e->type;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_unmap_page(io_ctl);

	return 0;
}

static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space *entry)
{
	int ret;

	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
	if (ret)
		return ret;

	copy_page(entry->bitmap, io_ctl->cur);
	io_ctl_unmap_page(io_ctl);

	return 0;
}

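/*
 * Recalculate ctl->extents_thresh, the number of extent entries we allow
 * before new free space is tracked in bitmaps instead, based on the block
 * group size and how much memory the existing bitmaps already use.
 */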
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group *block_group = ctl->private;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->length;
	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	max_bitmaps = max_t(u64, max_bitmaps, 1);

	ASSERT(ctl->total_bitmaps <= max_bitmaps);

	/*
	 * We are trying to keep the total amount of memory used per 1GiB of
	 * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
	 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
	 * bitmaps, we may end up using more memory than this.
	 */
	if (size < SZ_1G)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);

	bitmap_bytes = ctl->total_bitmaps * ctl->unit;

	/*
	 * we want the extent entry threshold to always be at most 1/2 the max
	 * bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);

	ctl->extents_thresh =
		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}

static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_path *path, u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct btrfs_io_ctl io_ctl;
	struct btrfs_key key;
	struct btrfs_free_space *e, *n;
	LIST_HEAD(bitmaps);
	u64 num_entries;
	u64 num_bitmaps;
	u64 generation;
	u8 type;
	int ret = 0;

	/* Nothing in the space cache, goodbye */
	if (!i_size_read(inode))
		return 0;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return 0;
	else if (ret > 0) {
		btrfs_release_path(path);
		return 0;
	}

	ret = -1;

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
	btrfs_release_path(path);

	if (!BTRFS_I(inode)->generation) {
		btrfs_info(fs_info,
			   "the free space cache file (%llu) is invalid, skip it",
			   offset);
		return 0;
	}

	if (BTRFS_I(inode)->generation != generation) {
		btrfs_err(fs_info,
			  "free space inode generation (%llu) did not match free space cache generation (%llu)",
			  BTRFS_I(inode)->generation, generation);
		return 0;
	}

	if (!num_entries)
		return 0;

	ret = io_ctl_init(&io_ctl, inode, 0);
	if (ret)
		return ret;

	readahead_cache(inode);

	ret = io_ctl_prepare_pages(&io_ctl, true);
	if (ret)
		goto out;

	ret = io_ctl_check_crc(&io_ctl, 0);
	if (ret)
		goto free_cache;

	ret = io_ctl_check_generation(&io_ctl, generation);
	if (ret)
		goto free_cache;

	while (num_entries) {
		e = kmem_cache_zalloc(btrfs_free_space_cachep,
				      GFP_NOFS);
		if (!e) {
			ret = -ENOMEM;
			goto free_cache;
		}

		ret = io_ctl_read_entry(&io_ctl, e, &type);
		if (ret) {
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (!e->bytes) {
			ret = -1;
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (type == BTRFS_FREE_SPACE_EXTENT) {
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				btrfs_err(fs_info,
					"Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
		} else {
			ASSERT(num_bitmaps);
			num_bitmaps--;
			e->bitmap = kmem_cache_zalloc(
					btrfs_free_space_bitmap_cachep, GFP_NOFS);
			if (!e->bitmap) {
				ret = -ENOMEM;
				kmem_cache_free(
					btrfs_free_space_cachep, e);
				goto free_cache;
			}
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			ctl->total_bitmaps++;
			recalculate_thresholds(ctl);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				btrfs_err(fs_info,
					"Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
			list_add_tail(&e->list, &bitmaps);
		}

		num_entries--;
	}

	io_ctl_unmap_page(&io_ctl);

	/*
	 * The bitmap contents were written out after all the entries, so read
	 * them back in the order that the bitmap entries were added to the
	 * cache.
	 */
	list_for_each_entry_safe(e, n, &bitmaps, list) {
		list_del_init(&e->list);
		ret = io_ctl_read_bitmap(&io_ctl, e);
		if (ret)
			goto free_cache;
	}

	io_ctl_drop_pages(&io_ctl);
	ret = 1;
out:
	io_ctl_free(&io_ctl);
	return ret;
free_cache:
	io_ctl_drop_pages(&io_ctl);
	__btrfs_remove_free_space_cache(ctl);
	goto out;
}

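/*
 * Move every entry from the temporary ctl into the block group's real free
 * space ctl, expanding bitmap entries back into individual free space ranges.
 */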
static int copy_free_space_cache(struct btrfs_block_group *block_group,
				 struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int ret = 0;

	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info);
			ret = btrfs_add_free_space(block_group, info->offset,
						   info->bytes);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			u64 offset = info->offset;
			u64 bytes = ctl->unit;

			while (search_bitmap(ctl, info, &offset, &bytes,
					     false) == 0) {
				ret = btrfs_add_free_space(block_group, offset,
							   bytes);
				if (ret)
					break;
				bitmap_clear_bits(ctl, info, offset, bytes);
				offset = info->offset;
				bytes = ctl->unit;
			}
			free_bitmap(ctl, info);
		}
		cond_resched();
	}
	return ret;
}

int load_free_space_cache(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space_ctl tmp_ctl = {};
	struct inode *inode;
	struct btrfs_path *path;
	int ret = 0;
	bool matched;
	u64 used = block_group->used;

	/*
	 * Because we could potentially discard our loaded free space, we want
	 * to load everything into a temporary structure first, and then if it's
	 * valid copy it all into the actual free space ctl.
	 */
	btrfs_init_free_space_ctl(block_group, &tmp_ctl);

	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	path = btrfs_alloc_path();
	if (!path)
		return 0;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * We must pass a path with search_commit_root set to btrfs_iget in
	 * order to avoid a deadlock when allocating extents for the tree root.
	 *
	 * When we are COWing an extent buffer from the tree root, when looking
	 * for a free extent, at extent-tree.c:find_free_extent(), we can find
	 * a block group without its free space cache loaded. When we find one
	 * we must load its space cache which requires reading its free space
	 * cache's inode item from the root tree. If this inode item is located
	 * in the same leaf that we started COWing before, then we end up in
	 * deadlock on the extent buffer (trying to read lock it when we
	 * previously write locked it).
	 *
	 * It's safe to read the inode item using the commit root because
	 * block groups, once loaded, stay in memory forever (until they are
	 * removed) as well as their space caches once loaded. New block groups
	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
	 * we will never try to read their inode item while the fs is mounted.
	 */
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

	/* We may have converted the inode and made the cache invalid. */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
		goto out;
	}
	spin_unlock(&block_group->lock);

	ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
				      path, block_group->start);
	btrfs_free_path(path);
	if (ret <= 0)
		goto out;

	matched = (tmp_ctl.free_space == (block_group->length - used -
					  block_group->bytes_super));

	if (matched) {
		ret = copy_free_space_cache(block_group, &tmp_ctl);
		/*
		 * ret == 1 means we successfully loaded the free space cache,
		 * so we need to re-set it here.
		 */
		if (ret == 0)
			ret = 1;
	} else {
		__btrfs_remove_free_space_cache(&tmp_ctl);
		btrfs_warn(fs_info,
			   "block group %llu has wrong amount of free space",
			   block_group->start);
		ret = -1;
	}
out:
	if (ret < 0) {
		/* This cache is bogus, make sure it gets cleared */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		ret = 0;

		btrfs_warn(fs_info,
			   "failed to load free space cache for block group %llu, rebuilding it now",
			   block_group->start);
	}

	spin_lock(&ctl->tree_lock);
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
	iput(inode);
	return ret;
}

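/*
 * Write every extent and bitmap entry in the free space ctl (including any
 * entries currently held by the block group's allocation cluster and any
 * ranges being trimmed) into the cache file.  Bitmap entries are queued on
 * @bitmap_list so their contents can be written out afterwards.
 */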
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space_ctl *ctl,
			      struct btrfs_block_group *block_group,
			      int *entries, int *bitmaps,
			      struct list_head *bitmap_list)
{
	int ret;
	struct btrfs_free_cluster *cluster = NULL;
	struct btrfs_free_cluster *cluster_locked = NULL;
	struct rb_node *node = rb_first(&ctl->free_space_offset);
	struct btrfs_trim_range *trim_entry;

	/* Get the cluster for this block_group if it exists */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
	}

	if (!node && cluster) {
		cluster_locked = cluster;
		spin_lock(&cluster_locked->lock);
		node = rb_first(&cluster->root);
		cluster = NULL;
	}

	/* Write out the extent entries */
	while (node) {
		struct btrfs_free_space *e;

		e = rb_entry(node, struct btrfs_free_space, offset_index);
		*entries += 1;

		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
				       e->bitmap);
		if (ret)
			goto fail;

		if (e->bitmap) {
			list_add_tail(&e->list, bitmap_list);
			*bitmaps += 1;
		}
		node = rb_next(node);
		if (!node && cluster) {
			node = rb_first(&cluster->root);
			cluster_locked = cluster;
			spin_lock(&cluster_locked->lock);
			cluster = NULL;
		}
	}
	if (cluster_locked) {
		spin_unlock(&cluster_locked->lock);
		cluster_locked = NULL;
	}

	/*
	 * Make sure we don't miss any range that was removed from our rbtree
	 * because trimming is running. Otherwise after a umount+mount (or crash
	 * after committing the transaction) we would leak free space and get
	 * an inconsistent free space cache report from fsck.
	 */
	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
				       trim_entry->bytes, NULL);
		if (ret)
			goto fail;
		*entries += 1;
	}

	return 0;
fail:
	if (cluster_locked)
		spin_unlock(&cluster_locked->lock);
	return -ENOSPC;
}

static noinline_for_stack int
update_cache_item(struct btrfs_trans_handle *trans,
		  struct btrfs_root *root,
		  struct inode *inode,
		  struct btrfs_path *path, u64 offset,
		  int entries, int bitmaps)
{
	struct btrfs_key key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, 0, 0, NULL);
		goto fail;
	}
	leaf = path->nodes[0];
	if (ret > 0) {
		struct btrfs_key found_key;
		ASSERT(path->slots[0]);
		path->slots[0]--;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
		    found_key.offset != offset) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
					 inode->i_size - 1, EXTENT_DELALLOC, 0,
					 0, NULL);
			btrfs_release_path(path);
			goto fail;
		}
	}

	BTRFS_I(inode)->generation = trans->transid;
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_set_free_space_entries(leaf, header, entries);
	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;

fail:
	return -1;
}

static noinline_for_stack int write_pinned_extent_entries(
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_io_ctl *io_ctl,
			    int *entries)
{
	u64 start, extent_start, extent_end, len;
	struct extent_io_tree *unpin = NULL;
	int ret;

	if (!block_group)
		return 0;

	/*
	 * We want to add any pinned extents to our free space cache
	 * so we don't leak the space.
	 *
	 * We shouldn't have switched the pinned extents yet, so this is the
	 * right one.
	 */
	unpin = &trans->transaction->pinned_extents;

	start = block_group->start;

	while (start < block_group->start + block_group->length) {
		ret = find_first_extent_bit(unpin, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY, NULL);
		if (ret)
			return 0;

		/* This pinned extent is out of our range */
		if (extent_start >= block_group->start + block_group->length)
			return 0;

		extent_start = max(extent_start, start);
		extent_end = min(block_group->start + block_group->length,
				 extent_end + 1);
		len = extent_end - extent_start;

		*entries += 1;
		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
		if (ret)
			return -ENOSPC;

		start = extent_end;
	}

	return 0;
}

static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;
	int ret;

	/* Write out the bitmaps */
	list_for_each_entry_safe(entry, next, bitmap_list, list) {
		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
		if (ret)
			return -ENOSPC;
		list_del_init(&entry->list);
	}

	return 0;
}

static int flush_dirty_cache(struct inode *inode)
{
	int ret;

	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret)
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, 0, 0, NULL);

	return ret;
}

static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;

	list_for_each_entry_safe(entry, next, bitmap_list, list)
		list_del_init(&entry->list);
}

static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
			   struct btrfs_io_ctl *io_ctl,
			   struct extent_state **cached_state)
{
	io_ctl_drop_pages(io_ctl);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, cached_state);
}

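/*
 * Wait for the cache file writeback started by __btrfs_write_out_cache to
 * finish, then update the free space header item so the cache is marked
 * valid for this transaction.  On failure, the cache pages are invalidated
 * and the inode generation is cleared so the stale cache won't be trusted.
 */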
static int __btrfs_wait_cache_io(struct btrfs_root *root,
				 struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *block_group,
				 struct btrfs_io_ctl *io_ctl,
				 struct btrfs_path *path, u64 offset)
{
	int ret;
	struct inode *inode = io_ctl->inode;

	if (!inode)
		return 0;

	/* Flush the dirty pages in the cache file. */
	ret = flush_dirty_cache(inode);
	if (ret)
		goto out;

	/* Update the cache item to tell everyone this cache file is valid. */
	ret = update_cache_item(trans, root, inode, path, offset,
				io_ctl->entries, io_ctl->bitmaps);
out:
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
		if (block_group)
			btrfs_debug(root->fs_info,
	  "failed to write free space cache for block group %llu error %d",
				  block_group->start, ret);
	}
	btrfs_update_inode(trans, root, BTRFS_I(inode));

	if (block_group) {
		/* the dirty list is protected by the dirty_bgs_lock */
		spin_lock(&trans->transaction->dirty_bgs_lock);

		/* the disk_cache_state is protected by the block group lock */
		spin_lock(&block_group->lock);

		/*
		 * Only mark this as written if we didn't get put back on
		 * the dirty list while waiting for IO.  Otherwise our
		 * cache state won't be right, and we won't get written again.
		 */
		if (!ret && list_empty(&block_group->dirty_list))
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		else if (ret)
			block_group->disk_cache_state = BTRFS_DC_ERROR;

		spin_unlock(&block_group->lock);
		spin_unlock(&trans->transaction->dirty_bgs_lock);
		io_ctl->inode = NULL;
		iput(inode);
	}

	return ret;
}

int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
			struct btrfs_block_group *block_group,
			struct btrfs_path *path)
{
	return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
				     block_group, &block_group->io_ctl,
				     path, block_group->start);
}

/**
 * Write out cached info to an inode
 *
 * @root:        root the inode belongs to
 * @inode:       freespace inode we are writing out
 * @ctl:         free space cache we are going to write out
 * @block_group: block_group for this cache if it belongs to a block_group
 * @io_ctl:      holds context for the io
 * @trans:       the trans handle
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount.  This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 */
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_block_group *block_group,
				   struct btrfs_io_ctl *io_ctl,
				   struct btrfs_trans_handle *trans)
{
	struct extent_state *cached_state = NULL;
	LIST_HEAD(bitmap_list);
	int entries = 0;
	int bitmaps = 0;
	int ret;
	int must_iput = 0;

	if (!i_size_read(inode))
		return -EIO;

	WARN_ON(io_ctl->pages);
	ret = io_ctl_init(io_ctl, inode, 1);
	if (ret)
		return ret;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
		down_write(&block_group->data_rwsem);
		spin_lock(&block_group->lock);
		if (block_group->delalloc_bytes) {
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
			spin_unlock(&block_group->lock);
			up_write(&block_group->data_rwsem);
			BTRFS_I(inode)->generation = 0;
			ret = 0;
			must_iput = 1;
			goto out;
		}
		spin_unlock(&block_group->lock);
	}

	/* Lock all pages first so we can lock the extent safely. */
	ret = io_ctl_prepare_pages(io_ctl, false);
	if (ret)
		goto out_unlock;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
			 &cached_state);

	io_ctl_set_generation(io_ctl, trans->transid);

	mutex_lock(&ctl->cache_writeout_mutex);
	/* Write out the extent entries in the free space cache */
	spin_lock(&ctl->tree_lock);
	ret = write_cache_extent_entries(io_ctl, ctl,
					 block_group, &entries, &bitmaps,
					 &bitmap_list);
	if (ret)
		goto out_nospc_locked;

	/*
	 * Some spaces that are freed in the current transaction are pinned;
	 * they will be added into the free space cache after the transaction
	 * is committed, and we shouldn't lose them.
	 *
	 * If this changes while we are working we'll get added back to
	 * the dirty list and redo it.  No locking needed
	 */
	ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
	if (ret)
		goto out_nospc_locked;

	/*
	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
	 * locked while doing it because a concurrent trim can be manipulating
	 * or freeing the bitmap.
	 */
	ret = write_bitmap_entries(io_ctl, &bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);
	if (ret)
		goto out_nospc;

	/* Zero out the rest of the pages just to make sure */
	io_ctl_zero_remaining_pages(io_ctl);

	/* Everything is written out, now we dirty the pages in the file. */
	ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
				io_ctl->num_pages, 0, i_size_read(inode),
				&cached_state, false);
	if (ret)
		goto out_nospc;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);
	/*
	 * Release the pages and unlock the extent; we will flush
	 * them out later.
	 */
	io_ctl_drop_pages(io_ctl);
	io_ctl_free(io_ctl);

	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, &cached_state);

	/*
	 * At this point the pages are under IO and we're happy.
	 * The caller is responsible for waiting on them and updating
	 * the cache and the inode.
	 */
	io_ctl->entries = entries;
	io_ctl->bitmaps = bitmaps;

	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
	if (ret)
		goto out;

	return 0;

out_nospc_locked:
	cleanup_bitmap_list(&bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

out_nospc:
	cleanup_write_cache_enospc(inode, io_ctl, &cached_state);

out_unlock:
	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);

out:
	io_ctl->inode = NULL;
	io_ctl_free(io_ctl);
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
	}
	btrfs_update_inode(trans, root, BTRFS_I(inode));
	if (must_iput)
		iput(inode);
	return ret;
}

int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
			  struct btrfs_block_group *block_group,
			  struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct inode *inode;
	int ret = 0;

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode))
		return 0;

	ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
				block_group, &block_group->io_ctl, trans);
	if (ret) {
		btrfs_debug(fs_info,
	  "failed to write free space cache for block group %llu error %d",
			  block_group->start, ret);
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);

		block_group->io_ctl.inode = NULL;
		iput(inode);
	}

	/*
	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
	 * to wait for IO and put the inode
	 */

	return ret;
}

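/*
 * Helpers for translating between byte offsets and bit indexes inside a
 * bitmap entry.  Each bit represents ctl->unit bytes of space, so one bitmap
 * entry covers BITS_PER_BITMAP * ctl->unit bytes.
 */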
static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
					  u64 offset)
{
	ASSERT(offset >= bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
	return (unsigned long)(div_u64(bytes, unit));
}

static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += ctl->start;

	return bitmap_start;
}

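/*
 * Insert a free space entry into the offset-sorted rbtree.  An extent entry
 * and a bitmap entry may share an offset (the extent sorts first); inserting
 * a second entry of the same kind at an offset returns -EEXIST.
 */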
static int tree_insert_offset(struct rb_root *root, u64 offset,
			      struct rb_node *node, int bitmap)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset.  If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap.  So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically.  If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
			if (bitmap) {
				if (info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_right;
			} else {
				if (!info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_left;
			}
		}
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);

	return 0;
}

/*
 * searches the tree for the given offset.
 *
 * fuzzy - If this is set, then we are trying to make an allocation, and we just
 * want a section that has at least bytes size and comes at or after the given
 * offset.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_free_space_ctl *ctl,
		   u64 offset, int bitmap_only, int fuzzy)
{
	struct rb_node *n = ctl->free_space_offset.rb_node;
	struct btrfs_free_space *entry, *prev = NULL;

	/* find entry that is closest to the 'offset' */
	while (1) {
		if (!n) {
			entry = NULL;
			break;
		}

		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		prev = entry;

		if (offset < entry->offset)
			n = n->rb_left;
		else if (offset > entry->offset)
			n = n->rb_right;
		else
			break;
	}

	if (bitmap_only) {
		if (!entry)
			return NULL;
		if (entry->bitmap)
			return entry;

		/*
		 * bitmap entry and extent entry may share same offset,
		 * in that case, bitmap entry comes after extent entry.
		 */
		n = rb_next(n);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->offset != offset)
			return NULL;

		WARN_ON(!entry->bitmap);
		return entry;
	} else if (entry) {
		if (entry->bitmap) {
			/*
			 * if previous extent entry covers the offset,
			 * we should return it instead of the bitmap entry
			 */
			n = rb_prev(&entry->offset_index);
			if (n) {
				prev = rb_entry(n, struct btrfs_free_space,
						offset_index);
				if (!prev->bitmap &&
				    prev->offset + prev->bytes > offset)
					entry = prev;
			}
		}
		return entry;
	}

	if (!prev)
		return NULL;

	/* find last entry before the 'offset' */
	entry = prev;
	if (entry->offset > offset) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			entry = rb_entry(n, struct btrfs_free_space,
					offset_index);
			ASSERT(entry->offset <= offset);
		} else {
			if (fuzzy)
				return entry;
			else
				return NULL;
		}
	}

	if (entry->bitmap) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			prev = rb_entry(n, struct btrfs_free_space,
					offset_index);
			if (!prev->bitmap &&
			    prev->offset + prev->bytes > offset)
				return prev;
		}
		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
			return entry;
	} else if (entry->offset + entry->bytes > offset)
		return entry;

	if (!fuzzy)
		return NULL;

	while (1) {
		if (entry->bitmap) {
			if (entry->offset + BITS_PER_BITMAP *
			    ctl->unit > offset)
				break;
		} else {
			if (entry->offset + entry->bytes > offset)
				break;
		}

		n = rb_next(&entry->offset_index);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
	}
	return entry;
}

static inline void
__unlink_free_space(struct btrfs_free_space_ctl *ctl,
		    struct btrfs_free_space *info)
{
	rb_erase(&info->offset_index, &ctl->free_space_offset);
	ctl->free_extents--;

	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR]--;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
	}
}

static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	__unlink_free_space(ctl, info);
	ctl->free_space -= info->bytes;
}

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info)
{
	int ret = 0;

	ASSERT(info->bytes || info->bitmap);
	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
				 &info->offset_index, (info->bitmap != NULL));
	if (ret)
		return ret;

	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR]++;
		ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
	}

	ctl->free_space += info->bytes;
	ctl->free_extents++;
	return ret;
}

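/*
 * Clear or set a range of bits in a bitmap entry and keep the entry's byte
 * count, bitmap_extents and the ctl's discardable stats in sync.  The
 * double-underscore variant leaves ctl->free_space untouched.
 */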
static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *info,
				       u64 offset, u64 bytes)
{
	unsigned long start, count, end;
	int extent_delta = -1;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);

	bitmap_clear(info->bitmap, start, count);

	info->bytes -= bytes;
	if (info->max_extent_size > ctl->unit)
		info->max_extent_size = 0;

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta++;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta++;

	info->bitmap_extents += extent_delta;
	if (!btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
	}
}

static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes)
{
	__bitmap_clear_bits(ctl, info, offset, bytes);
	ctl->free_space -= bytes;
}

static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
{
	unsigned long start, count, end;
	int extent_delta = 1;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);

	bitmap_set(info->bitmap, start, count);

	info->bytes += bytes;
	ctl->free_space += bytes;

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta--;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta--;

	info->bitmap_extents += extent_delta;
	if (!btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
		ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
	}
}

/*
 * If we cannot find a suitable extent, we use *bytes to record the size of
 * the largest free extent we did find in the bitmap.
 */
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc)
{
	unsigned long found_bits = 0;
	unsigned long max_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;
	unsigned long extent_bits;

	/*
	 * Skip searching the bitmap if we don't have a contiguous section that
	 * is large enough for this allocation.
	 */
	if (for_alloc &&
	    bitmap_info->max_extent_size &&
	    bitmap_info->max_extent_size < *bytes) {
		*bytes = bitmap_info->max_extent_size;
		return -1;
	}

	i = offset_to_bit(bitmap_info->offset, ctl->unit,
			  max_t(u64, *offset, bitmap_info->offset));
	bits = bytes_to_bits(*bytes, ctl->unit);

	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
		if (for_alloc && bits == 1) {
			found_bits = 1;
			break;
		}
		next_zero = find_next_zero_bit(bitmap_info->bitmap,
					       BITS_PER_BITMAP, i);
		extent_bits = next_zero - i;
		if (extent_bits >= bits) {
			found_bits = extent_bits;
			break;
		} else if (extent_bits > max_bits) {
			max_bits = extent_bits;
		}
		i = next_zero;
	}

	if (found_bits) {
		*offset = (u64)(i * ctl->unit) + bitmap_info->offset;
		*bytes = (u64)(found_bits) * ctl->unit;
		return 0;
	}

	*bytes = (u64)(max_bits) * ctl->unit;
	bitmap_info->max_extent_size = *bytes;
	return -1;
}

static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
{
	if (entry->bitmap)
		return entry->max_extent_size;
	return entry->bytes;
}

/*
 * Find a free space entry that can satisfy *bytes.  On failure, cache the
 * size of the largest extent seen in *max_extent_size.
 */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
		unsigned long align, u64 *max_extent_size)
{
	struct btrfs_free_space *entry;
	struct rb_node *node;
	u64 tmp;
	u64 align_off;
	int ret;

	if (!ctl->free_space_offset.rb_node)
		goto out;

	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
	if (!entry)
		goto out;

	for (node = &entry->offset_index; node; node = rb_next(node)) {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (entry->bytes < *bytes) {
			*max_extent_size = max(get_max_extent_size(entry),
					       *max_extent_size);
			continue;
		}

		/* make sure the space returned is big enough
		 * to match our requested alignment
		 */
		if (*bytes >= align) {
			tmp = entry->offset - ctl->start + align - 1;
			tmp = div64_u64(tmp, align);
			tmp = tmp * align + ctl->start;
			align_off = tmp - entry->offset;
		} else {
			align_off = 0;
			tmp = entry->offset;
		}

		if (entry->bytes < *bytes + align_off) {
			*max_extent_size = max(get_max_extent_size(entry),
					       *max_extent_size);
			continue;
		}

		if (entry->bitmap) {
			u64 size = *bytes;

			ret = search_bitmap(ctl, entry, &tmp, &size, true);
			if (!ret) {
				*offset = tmp;
				*bytes = size;
				return entry;
			} else {
				*max_extent_size =
					max(get_max_extent_size(entry),
					    *max_extent_size);
			}
			continue;
		}

		*offset = tmp;
		*bytes = entry->bytes - align_off;
		return entry;
	}
out:
	return NULL;
}

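/*
 * Turn a pre-allocated entry into a bitmap entry covering the bitmap window
 * that contains @offset and link it into the offset rb-tree.
 */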
static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info, u64 offset)
{
	info->offset = offset_to_bitmap(ctl, offset);
	info->bytes = 0;
	info->bitmap_extents = 0;
	INIT_LIST_HEAD(&info->list);
	link_free_space(ctl, info);
	ctl->total_bitmaps++;
	recalculate_thresholds(ctl);
}

static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info)
{
	/*
	 * Normally when this is called, the bitmap is completely empty. However,
	 * if we are blowing up the free space cache for one reason or another
	 * via __btrfs_remove_free_space_cache(), then it may not be freed and
	 * we may leave stats on the table.
	 */
	if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			bitmap_info->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
	}
	unlink_free_space(ctl, bitmap_info);
	kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
	ctl->total_bitmaps--;
	recalculate_thresholds(ctl);
}

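/*
 * Remove [*offset, *offset + *bytes) from bitmap entries, walking into the
 * next bitmap when the range spans more than one.  Returns -EAGAIN if the
 * remainder is covered by an extent entry and the caller must retry.
 */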
static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *bitmap_info,
			      u64 *offset, u64 *bytes)
{
	u64 end;
	u64 search_start, search_bytes;
	int ret;

again:
	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;

	/*
	 * We need to search for bits in this bitmap.  We could only cover some
	 * of the extent in this bitmap thanks to how we add space, so we need
	 * to search for as much of it as we can and clear that amount, and then
	 * go searching for the next bit.
	 */
	search_start = *offset;
	search_bytes = ctl->unit;
	search_bytes = min(search_bytes, end - search_start + 1);
	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
			    false);
	if (ret < 0 || search_start != *offset)
		return -EINVAL;

	/* We may have found more bits than what we need */
	search_bytes = min(search_bytes, *bytes);

	/* Cannot clear past the end of the bitmap */
	search_bytes = min(search_bytes, end - search_start + 1);

	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
	*offset += search_bytes;
	*bytes -= search_bytes;

	if (*bytes) {
		struct rb_node *next = rb_next(&bitmap_info->offset_index);
		if (!bitmap_info->bytes)
			free_bitmap(ctl, bitmap_info);

		/*
		 * no entry after this bitmap, but we still have bytes to
		 * remove, so something has gone wrong.
		 */
		if (!next)
			return -EINVAL;

		bitmap_info = rb_entry(next, struct btrfs_free_space,
				       offset_index);

		/*
		 * if the next entry isn't a bitmap we need to return to let the
		 * extent stuff do its work.
		 */
		if (!bitmap_info->bitmap)
			return -EAGAIN;

		/*
		 * Ok the next item is a bitmap, but it may not actually hold
		 * the information for the rest of this free space stuff, so
		 * look for it, and if we don't find it return so we can try
		 * everything over again.
		 */
		search_start = *offset;
		search_bytes = ctl->unit;
		ret = search_bitmap(ctl, bitmap_info, &search_start,
				    &search_bytes, false);
		if (ret < 0 || search_start != *offset)
			return -EAGAIN;

		goto again;
	} else if (!bitmap_info->bytes)
		free_bitmap(ctl, bitmap_info);

	return 0;
}

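/*
 * Set as much of [offset, offset + bytes) as fits inside this bitmap and
 * return the number of bytes actually added.  Adding an untrimmed region
 * marks the whole bitmap untrimmed.
 */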
static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
			       struct btrfs_free_space *info, u64 offset,
			       u64 bytes, enum btrfs_trim_state trim_state)
{
	u64 bytes_to_set = 0;
	u64 end;

	/*
	 * This is a tradeoff to make bitmap trim state minimal.  We mark the
	 * whole bitmap untrimmed if at any point we add untrimmed regions.
	 */
	if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
		if (btrfs_free_space_trimmed(info)) {
			ctl->discardable_extents[BTRFS_STAT_CURR] +=
				info->bitmap_extents;
			ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
		}
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
	}

	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);

	bytes_to_set = min(end - offset, bytes);

	bitmap_set_bits(ctl, info, offset, bytes_to_set);

	/*
	 * We set some bytes, we have no idea what the max extent size is
	 * anymore.
	 */
	info->max_extent_size = 0;

	return bytes_to_set;
}

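/*
 * Decide whether a new chunk of free space should be accounted in a bitmap
 * or kept as a regular extent entry.
 */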
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
		      struct btrfs_free_space *info)
{
	struct btrfs_block_group *block_group = ctl->private;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	bool forced = false;

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group))
		forced = true;
#endif

	/* This is a way to reclaim large regions from the bitmaps. */
	if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
		return false;

	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent, and don't have to deal with the bitmap
	 */
	if (!forced && ctl->free_extents < ctl->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them to larger extents, however if we have plenty
		 * of cache left then go ahead and add them, no sense in adding
		 * the overhead of a bitmap if we don't have to.
		 */
		if (info->bytes <= fs_info->sectorsize * 8) {
			if (ctl->free_extents * 3 <= ctl->extents_thresh)
				return false;
		} else {
			return false;
		}
	}

	/*
	 * The original block groups from mkfs can be really small, like 8
	 * megabytes, so don't bother with a bitmap for those entries.  However
	 * some block groups can be smaller than what a bitmap would cover but
	 * are still large enough that they could overflow the 32k memory limit,
	 * so still allow those block groups to have a bitmap entry.
	 */
	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
		return false;

	return true;
}

static const struct btrfs_free_space_op free_space_op = {
	.use_bitmap		= use_bitmap,
};

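/*
 * Try to add the entry's range to an existing (cluster or tree) bitmap,
 * allocating a new bitmap if needed.  Returns 1 if the space was consumed,
 * 0 if the caller should link it as an extent entry, or a negative errno.
 */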
static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	struct btrfs_free_space *bitmap_info;
	struct btrfs_block_group *block_group = NULL;
	int added = 0;
	u64 bytes, offset, bytes_added;
	enum btrfs_trim_state trim_state;
	int ret;

	bytes = info->bytes;
	offset = info->offset;
	trim_state = info->trim_state;

	if (!ctl->op->use_bitmap(ctl, info))
		return 0;

	if (ctl->op == &free_space_op)
		block_group = ctl->private;
again:
	/*
	 * Since we link bitmaps right into the cluster we need to see if we
	 * have a cluster here, and if so and it has our bitmap we need to add
	 * the free space to that bitmap.
	 */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		struct btrfs_free_cluster *cluster;
		struct rb_node *node;
		struct btrfs_free_space *entry;

		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
		spin_lock(&cluster->lock);
		node = rb_first(&cluster->root);
		if (!node) {
			spin_unlock(&cluster->lock);
			goto no_cluster_bitmap;
		}

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!entry->bitmap) {
			spin_unlock(&cluster->lock);
			goto no_cluster_bitmap;
		}

		if (entry->offset == offset_to_bitmap(ctl, offset)) {
			bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
							  bytes, trim_state);
			bytes -= bytes_added;
			offset += bytes_added;
		}
		spin_unlock(&cluster->lock);
		if (!bytes) {
			ret = 1;
			goto out;
		}
	}

no_cluster_bitmap:
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		ASSERT(added == 0);
		goto new_bitmap;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);
	bytes -= bytes_added;
	offset += bytes_added;
	added = 0;

	if (!bytes) {
		ret = 1;
		goto out;
	} else
		goto again;

new_bitmap:
	if (info && info->bitmap) {
		add_new_bitmap(ctl, info, offset);
		added = 1;
		info = NULL;
		goto again;
	} else {
		spin_unlock(&ctl->tree_lock);

		/* no pre-allocated info, allocate a new one */
		if (!info) {
			info = kmem_cache_zalloc(btrfs_free_space_cachep,
						 GFP_NOFS);
			if (!info) {
				spin_lock(&ctl->tree_lock);
				ret = -ENOMEM;
				goto out;
			}
		}

		/* allocate the bitmap */
		info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
						 GFP_NOFS);
		info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
		spin_lock(&ctl->tree_lock);
		if (!info->bitmap) {
			ret = -ENOMEM;
			goto out;
		}
		goto again;
	}

out:
	if (info) {
		if (info->bitmap)
			kmem_cache_free(btrfs_free_space_bitmap_cachep,
					info->bitmap);
		kmem_cache_free(btrfs_free_space_cachep, info);
	}

	return ret;
}

/*
 * Free space merging rules:
 *  1) Merge trimmed areas together
 *  2) Let untrimmed areas coalesce with trimmed areas
 *  3) Always pull neighboring regions from bitmaps
 *
 * The above rules are for when we merge free space based on btrfs_trim_state.
 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
 * same reason: to promote larger extent regions which makes life easier for
 * find_free_extent().  Rule 2 enables coalescing based on the common path,
 * which is returning free space from btrfs_finish_extent_commit().  So when
 * free space is trimmed, it will prevent aggregating a newly trimmed region
 * with untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger
 * extents and provide find_free_extent() with the largest extents possible
 * hoping for the reuse path.
 */
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
			  struct btrfs_free_space *info, bool update_stat)
{
	struct btrfs_free_space *left_info = NULL;
	struct btrfs_free_space *right_info;
	bool merged = false;
	u64 offset = info->offset;
	u64 bytes = info->bytes;
	const bool is_trimmed = btrfs_free_space_trimmed(info);

	/*
	 * first we want to see if there is free space adjacent to the range we
	 * are adding, if there is remove that struct and add a new one to
	 * cover the entire range
	 */
	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
	if (right_info && rb_prev(&right_info->offset_index))
		left_info = rb_entry(rb_prev(&right_info->offset_index),
				     struct btrfs_free_space, offset_index);
	else if (!right_info)
		left_info = tree_search_offset(ctl, offset - 1, 0, 0);

	/* See try_merge_free_space() comment. */
	if (right_info && !right_info->bitmap &&
	    (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
		if (update_stat)
			unlink_free_space(ctl, right_info);
		else
			__unlink_free_space(ctl, right_info);
		info->bytes += right_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, right_info);
		merged = true;
	}

	/* See try_merge_free_space() comment. */
	if (left_info && !left_info->bitmap &&
	    left_info->offset + left_info->bytes == offset &&
	    (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
		if (update_stat)
			unlink_free_space(ctl, left_info);
		else
			__unlink_free_space(ctl, left_info);
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, left_info);
		merged = true;
	}

	return merged;
}

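/*
 * Pull free space that immediately follows @info out of the adjacent bitmap
 * and append it to this extent entry.
 */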
static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     bool update_stat)
{
	struct btrfs_free_space *bitmap;
	unsigned long i;
	unsigned long j;
	const u64 end = info->offset + info->bytes;
	const u64 bitmap_offset = offset_to_bitmap(ctl, end);
	u64 bytes;

	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
	if (!bitmap)
		return false;

	i = offset_to_bit(bitmap->offset, ctl->unit, end);
	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
	if (j == i)
		return false;
	bytes = (j - i) * ctl->unit;
	info->bytes += bytes;

	/* See try_merge_free_space() comment. */
	if (!btrfs_free_space_trimmed(bitmap))
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (update_stat)
		bitmap_clear_bits(ctl, bitmap, end, bytes);
	else
		__bitmap_clear_bits(ctl, bitmap, end, bytes);

	if (!bitmap->bytes)
		free_bitmap(ctl, bitmap);

	return true;
}

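/*
 * Pull free space that immediately precedes @info out of the previous bitmap
 * and prepend it to this extent entry.
 */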
static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *info,
				       bool update_stat)
{
	struct btrfs_free_space *bitmap;
	u64 bitmap_offset;
	unsigned long i;
	unsigned long j;
	unsigned long prev_j;
	u64 bytes;

	bitmap_offset = offset_to_bitmap(ctl, info->offset);
	/* If we're on a boundary, try the previous logical bitmap. */
	if (bitmap_offset == info->offset) {
		if (info->offset == 0)
			return false;
		bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
	}

	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
	if (!bitmap)
		return false;

	i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
	j = 0;
	prev_j = (unsigned long)-1;
	for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
		if (j > i)
			break;
		prev_j = j;
	}
	if (prev_j == i)
		return false;

	if (prev_j == (unsigned long)-1)
		bytes = (i + 1) * ctl->unit;
	else
		bytes = (i - prev_j) * ctl->unit;

	info->offset -= bytes;
	info->bytes += bytes;

	/* See try_merge_free_space() comment. */
	if (!btrfs_free_space_trimmed(bitmap))
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (update_stat)
		bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
	else
		__bitmap_clear_bits(ctl, bitmap, info->offset, bytes);

	if (!bitmap->bytes)
		free_bitmap(ctl, bitmap);

	return true;
}

/*
 * We prefer always to allocate from extent entries, both for clustered and
 * non-clustered allocation requests. So when attempting to add a new extent
 * entry, try to see if there's adjacent free space in bitmap entries, and if
 * there is, migrate that space from the bitmaps to the extent.
 * Like this we get better chances of satisfying space allocation requests
 * because we attempt to satisfy them based on a single cache entry, and never
 * on 2 or more entries - even if the entries represent a contiguous free space
 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
 * ends).
 */
static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info,
			      bool update_stat)
{
	/*
	 * Only work with disconnected entries, as we can change their offset,
	 * and must be extent entries.
	 */
	ASSERT(!info->bitmap);
	ASSERT(RB_EMPTY_NODE(&info->offset_index));

	if (ctl->total_bitmaps > 0) {
		bool stole_end;
		bool stole_front = false;

		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
		if (ctl->total_bitmaps > 0)
			stole_front = steal_from_bitmap_to_front(ctl, info,
								 update_stat);

		if (stole_end || stole_front)
			try_merge_free_space(ctl, info, update_stat);
	}
}

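/*
 * Add free space back to the in-memory cache, merging with neighbours,
 * stealing from adjacent bitmaps, and falling back to a bitmap entry when
 * the extent threshold is exceeded.
 */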
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
			   struct btrfs_free_space_ctl *ctl,
			   u64 offset, u64 bytes,
			   enum btrfs_trim_state trim_state)
{
	struct btrfs_block_group *block_group = ctl->private;
	struct btrfs_free_space *info;
	int ret = 0;
	u64 filter_bytes = bytes;

	ASSERT(!btrfs_is_zoned(fs_info));

	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;
	info->trim_state = trim_state;
	RB_CLEAR_NODE(&info->offset_index);

	spin_lock(&ctl->tree_lock);

	if (try_merge_free_space(ctl, info, true))
		goto link;

	/*
	 * There was no extent directly to the left or right of this new
	 * extent, so we know we're going to have to allocate a new extent;
	 * before we do that, see if we need to drop this into a bitmap.
	 */
	ret = insert_into_bitmap(ctl, info);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		ret = 0;
		goto out;
	}
link:
	/*
	 * Only steal free space from adjacent bitmaps if we're sure we're not
	 * going to add the new free space to existing bitmap entries - because
	 * that would mean unnecessary work that would be reverted. Therefore
	 * attempt to steal space from bitmaps if we're adding an extent entry.
	 */
	steal_from_bitmap(ctl, info, true);

	filter_bytes = max(filter_bytes, info->bytes);

	ret = link_free_space(ctl, info);
	if (ret)
		kmem_cache_free(btrfs_free_space_cachep, info);
out:
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);

	if (ret) {
		btrfs_crit(fs_info, "unable to add free space :%d", ret);
		ASSERT(ret != -EEXIST);
	}

	if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
		btrfs_discard_check_filter(block_group, filter_bytes);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
	}

	return ret;
}

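/*
 * Zoned block groups track free space with an allocation offset instead of
 * extent/bitmap entries: account the freed range against that offset and
 * treat the portion below it as zone_unusable.
 */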
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
					u64 bytenr, u64 size, bool used)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	u64 offset = bytenr - block_group->start;
	u64 to_free, to_unusable;

	spin_lock(&ctl->tree_lock);
	if (!used)
		to_free = size;
	else if (offset >= block_group->alloc_offset)
		to_free = size;
	else if (offset + size <= block_group->alloc_offset)
		to_free = 0;
	else
		to_free = offset + size - block_group->alloc_offset;
	to_unusable = size - to_free;

	ctl->free_space += to_free;
	/*
	 * If the block group is read-only, we should account freed space into
	 * bytes_readonly.
	 */
	if (!block_group->ro)
		block_group->zone_unusable += to_unusable;
	spin_unlock(&ctl->tree_lock);
	if (!used) {
		spin_lock(&block_group->lock);
		block_group->alloc_offset -= size;
		spin_unlock(&block_group->lock);
	}

	/* All the region is now unusable. Mark it as unused and reclaim */
	if (block_group->zone_unusable == block_group->length)
		btrfs_mark_bg_unused(block_group);

	return 0;
}

int btrfs_add_free_space(struct btrfs_block_group *block_group,
			 u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    true);

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group->fs_info,
				      block_group->free_space_ctl,
				      bytenr, size, trim_state);
}

int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
				u64 bytenr, u64 size)
{
	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    false);

	return btrfs_add_free_space(block_group, bytenr, size);
}

/*
 * This is a subtle distinction because when adding free space back in general,
 * we want it to be added as untrimmed for async. But in the case where we add
 * it on loading of a block group, we want to consider it trimmed.
 */
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
				       u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_is_zoned(block_group->fs_info))
		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
						    true);

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
	    btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group->fs_info,
				      block_group->free_space_ctl,
				      bytenr, size, trim_state);
}

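/*
 * Remove [offset, offset + bytes) from the free space cache, splitting
 * extent entries and clearing bitmap bits as needed.
 */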
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
			    u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	int ret;
	bool re_search = false;

2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649
	if (btrfs_is_zoned(block_group->fs_info)) {
		/*
		 * This can happen with conventional zones when replaying log.
		 * Since the allocation info of tree-log nodes are not recorded
		 * to the extent-tree, calculate_alloc_pointer() failed to
		 * advance the allocation pointer after last allocated tree log
		 * node blocks.
		 *
		 * This function is called from
		 * btrfs_pin_extent_for_log_replay() when replaying the log.
		 * Advance the pointer not to overwrite the tree-log nodes.
		 */
		if (block_group->alloc_offset < offset + bytes)
			block_group->alloc_offset = offset + bytes;
2650
		return 0;
2651
	}
2652

2653
	spin_lock(&ctl->tree_lock);
2654

2655
again:
2656
	ret = 0;
2657 2658 2659
	if (!bytes)
		goto out_lock;

2660
	info = tree_search_offset(ctl, offset, 0, 0);
2661
	if (!info) {
2662 2663 2664 2665
		/*
		 * oops didn't find an extent that matched the space we wanted
		 * to remove, look for a bitmap instead
		 */
2666
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2667 2668
					  1, 0);
		if (!info) {
2669 2670 2671 2672
			/*
			 * If we found a partial bit of our free space in a
			 * bitmap but then couldn't find the other part this may
			 * be a problem, so WARN about it.
2673
			 */
2674
			WARN_ON(re_search);
2675 2676
			goto out_lock;
		}
2677 2678
	}

2679
	re_search = false;
2680
	if (!info->bitmap) {
2681
		unlink_free_space(ctl, info);
2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692
		if (offset == info->offset) {
			u64 to_free = min(bytes, info->bytes);

			info->bytes -= to_free;
			info->offset += to_free;
			if (info->bytes) {
				ret = link_free_space(ctl, info);
				WARN_ON(ret);
			} else {
				kmem_cache_free(btrfs_free_space_cachep, info);
			}
J
2694 2695 2696 2697 2698
			offset += to_free;
			bytes -= to_free;
			goto again;
		} else {
			u64 old_end = info->bytes + info->offset;
2699

2700
			info->bytes = offset - info->offset;
2701
			ret = link_free_space(ctl, info);
2702 2703 2704 2705
			WARN_ON(ret);
			if (ret)
				goto out_lock;

2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716
			/* Not enough bytes in this entry to satisfy us */
			if (old_end < offset + bytes) {
				bytes -= old_end - offset;
				offset = old_end;
				goto again;
			} else if (old_end == offset + bytes) {
				/* all done */
				goto out_lock;
			}
			spin_unlock(&ctl->tree_lock);

2717 2718 2719 2720
			ret = __btrfs_add_free_space(block_group->fs_info, ctl,
						     offset + bytes,
						     old_end - (offset + bytes),
						     info->trim_state);
2721 2722 2723
			WARN_ON(ret);
			goto out;
		}
J
2725

2726
	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2727 2728
	if (ret == -EAGAIN) {
		re_search = true;
2729
		goto again;
2730
	}
2731
out_lock:
2732
	btrfs_discard_update_discardable(block_group);
2733
	spin_unlock(&ctl->tree_lock);
J
2735 2736 2737
	return ret;
}

2738
void btrfs_dump_free_space(struct btrfs_block_group *block_group,
			   u64 bytes)
{
2741
	struct btrfs_fs_info *fs_info = block_group->fs_info;
2742
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	struct rb_node *n;
	int count = 0;

2747 2748 2749 2750 2751 2752 2753 2754 2755 2756
	/*
	 * Zoned btrfs does not use free space tree and cluster. Just print
	 * out the free space after the allocation offset.
	 */
	if (btrfs_is_zoned(fs_info)) {
		btrfs_info(fs_info, "free space %llu",
			   block_group->length - block_group->alloc_offset);
		return;
	}

2757
	spin_lock(&ctl->tree_lock);
2758
	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (info->bytes >= bytes && !block_group->ro)
			count++;
2762
		btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2763
			   info->offset, info->bytes,
2764
		       (info->bitmap) ? "yes" : "no");
	}
2766
	spin_unlock(&ctl->tree_lock);
2767
	btrfs_info(fs_info, "block group has cluster?: %s",
2768
	       list_empty(&block_group->cluster_list) ? "no" : "yes");
2769
	btrfs_info(fs_info,
2770
		   "%d blocks of free space at or bigger than bytes is", count);
}

2773 2774
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
			       struct btrfs_free_space_ctl *ctl)
{
2776
	struct btrfs_fs_info *fs_info = block_group->fs_info;

2778
	spin_lock_init(&ctl->tree_lock);
2779
	ctl->unit = fs_info->sectorsize;
2780
	ctl->start = block_group->start;
2781 2782
	ctl->private = block_group;
	ctl->op = &free_space_op;
2783 2784
	INIT_LIST_HEAD(&ctl->trimming_ranges);
	mutex_init(&ctl->cache_writeout_mutex);

2786 2787 2788 2789 2790
	/*
	 * we only want to have 32k of ram per block group for keeping
	 * track of free space, and if we pass 1/2 of that we want to
	 * start converting things over to using bitmaps
	 */
2791
	ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
}

2794 2795 2796 2797 2798 2799
/*
 * for a given cluster, put all of its extents back into the free
 * space cache.  If the block group passed doesn't match the block group
 * pointed to by the cluster, someone else raced in and freed the
 * cluster already.  In that case, we just return without changing anything
 */
2800
static void __btrfs_return_cluster_to_free_space(
2801
			     struct btrfs_block_group *block_group,
2802 2803
			     struct btrfs_free_cluster *cluster)
{
2804
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2805 2806 2807 2808
	struct btrfs_free_space *entry;
	struct rb_node *node;

	spin_lock(&cluster->lock);
2809 2810 2811 2812
	if (cluster->block_group != block_group) {
		spin_unlock(&cluster->lock);
		return;
	}
2813

2814
	cluster->block_group = NULL;
2815
	cluster->window_start = 0;
2816 2817
	list_del_init(&cluster->block_group_list);

2818
	node = rb_first(&cluster->root);
2819
	while (node) {
2820 2821
		bool bitmap;

2822 2823 2824
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		rb_erase(&entry->offset_index, &cluster->root);
2825
		RB_CLEAR_NODE(&entry->offset_index);
2826 2827

		bitmap = (entry->bitmap != NULL);
2828
		if (!bitmap) {
2829
			/* Merging treats extents as if they were new */
2830
			if (!btrfs_free_space_trimmed(entry)) {
2831
				ctl->discardable_extents[BTRFS_STAT_CURR]--;
2832 2833 2834
				ctl->discardable_bytes[BTRFS_STAT_CURR] -=
					entry->bytes;
			}
2835

2836
			try_merge_free_space(ctl, entry, false);
2837
			steal_from_bitmap(ctl, entry, false);
2838 2839

			/* As we insert directly, update these statistics */
2840
			if (!btrfs_free_space_trimmed(entry)) {
2841
				ctl->discardable_extents[BTRFS_STAT_CURR]++;
2842 2843 2844
				ctl->discardable_bytes[BTRFS_STAT_CURR] +=
					entry->bytes;
			}
2845
		}
2846
		tree_insert_offset(&ctl->free_space_offset,
2847
				   entry->offset, &entry->offset_index, bitmap);
2848
	}
2849
	cluster->root = RB_ROOT;
2850
	spin_unlock(&cluster->lock);
2851
	btrfs_put_block_group(block_group);
2852 2853
}

2854 2855
static void __btrfs_remove_free_space_cache_locked(
				struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *node;
2859 2860 2861

	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
2862 2863 2864 2865 2866 2867
		if (!info->bitmap) {
			unlink_free_space(ctl, info);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(ctl, info);
		}
2868 2869

		cond_resched_lock(&ctl->tree_lock);
2870
	}
2871 2872 2873 2874 2875 2876
}

void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
	spin_lock(&ctl->tree_lock);
	__btrfs_remove_free_space_cache_locked(ctl);
2877
	if (ctl->private)
2878
		btrfs_discard_update_discardable(ctl->private);
2879 2880 2881
	spin_unlock(&ctl->tree_lock);
}

2882
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
2883 2884
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2885
	struct btrfs_free_cluster *cluster;
2886
	struct list_head *head;

2888
	spin_lock(&ctl->tree_lock);
2889 2890 2891 2892
	while ((head = block_group->cluster_list.next) !=
	       &block_group->cluster_list) {
		cluster = list_entry(head, struct btrfs_free_cluster,
				     block_group_list);
2893 2894 2895

		WARN_ON(cluster->block_group != block_group);
		__btrfs_return_cluster_to_free_space(block_group, cluster);
2896 2897

		cond_resched_lock(&ctl->tree_lock);
2898
	}
2899
	__btrfs_remove_free_space_cache_locked(ctl);
2900
	btrfs_discard_update_discardable(block_group);
2901
	spin_unlock(&ctl->tree_lock);
2902

}

2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935
/**
 * btrfs_is_free_space_trimmed - see if everything is trimmed
 * @block_group: block_group of interest
 *
 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
 */
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	struct rb_node *node;
	bool ret = true;

	spin_lock(&ctl->tree_lock);
	node = rb_first(&ctl->free_space_offset);

	while (node) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);

		if (!btrfs_free_space_trimmed(info)) {
			ret = false;
			break;
		}

		node = rb_next(node);
	}

	spin_unlock(&ctl->tree_lock);
	return ret;
}

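/*
 * Find and remove @bytes (plus @empty_size) of free space for an allocation,
 * honouring the block group's stripe alignment.  Any unused alignment gap is
 * returned to the cache.
 */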
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
			       u64 offset, u64 bytes, u64 empty_size,
			       u64 *max_extent_size)
{
2940
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2941 2942
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
2943
	struct btrfs_free_space *entry = NULL;
2944
	u64 bytes_search = bytes + empty_size;
2945
	u64 ret = 0;
	u64 align_gap = 0;
	u64 align_gap_len = 0;
2948
	enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

2950 2951
	ASSERT(!btrfs_is_zoned(block_group->fs_info));

2952
	spin_lock(&ctl->tree_lock);
	entry = find_free_space(ctl, &offset, &bytes_search,
2954
				block_group->full_stripe_len, max_extent_size);
2955
	if (!entry)
2956 2957 2958 2959
		goto out;

	ret = offset;
	if (entry->bitmap) {
2960
		bitmap_clear_bits(ctl, entry, offset, bytes);
2961 2962 2963 2964

		if (!btrfs_free_space_trimmed(entry))
			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);

2965
		if (!entry->bytes)
2966
			free_bitmap(ctl, entry);
2967
	} else {
2968
		unlink_free_space(ctl, entry);
		align_gap_len = offset - entry->offset;
		align_gap = entry->offset;
2971
		align_gap_trim_state = entry->trim_state;

2973 2974 2975
		if (!btrfs_free_space_trimmed(entry))
			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);

		entry->offset = offset + bytes;
		WARN_ON(entry->bytes < bytes + align_gap_len);

		entry->bytes -= bytes + align_gap_len;
2980
		if (!entry->bytes)
2981
			kmem_cache_free(btrfs_free_space_cachep, entry);
2982
		else
2983
			link_free_space(ctl, entry);
2984
	}
2985
out:
2986
	btrfs_discard_update_discardable(block_group);
2987
	spin_unlock(&ctl->tree_lock);

	if (align_gap_len)
2990
		__btrfs_add_free_space(block_group->fs_info, ctl,
2991 2992
				       align_gap, align_gap_len,
				       align_gap_trim_state);
	return ret;
}
2995 2996 2997 2998 2999 3000 3001 3002 3003

/*
 * given a cluster, put all of its extents back into the free space
 * cache.  If a block group is passed, this function will only free
 * a cluster that belongs to the passed block group.
 *
 * Otherwise, it'll get a reference on the block group pointed to by the
 * cluster and remove the cluster from it.
 */
3004
void btrfs_return_cluster_to_free_space(
3005
			       struct btrfs_block_group *block_group,
3006 3007
			       struct btrfs_free_cluster *cluster)
{
3008
	struct btrfs_free_space_ctl *ctl;
3009 3010 3011 3012 3013 3014 3015

	/* first, get a safe pointer to the block group */
	spin_lock(&cluster->lock);
	if (!block_group) {
		block_group = cluster->block_group;
		if (!block_group) {
			spin_unlock(&cluster->lock);
3016
			return;
3017 3018 3019 3020
		}
	} else if (cluster->block_group != block_group) {
		/* someone else has already freed it don't redo their work */
		spin_unlock(&cluster->lock);
3021
		return;
3022
	}
3023
	btrfs_get_block_group(block_group);
3024 3025
	spin_unlock(&cluster->lock);

3026 3027
	ctl = block_group->free_space_ctl;

3028
	/* now return any extents the cluster had on it */
3029
	spin_lock(&ctl->tree_lock);
3030
	__btrfs_return_cluster_to_free_space(block_group, cluster);
3031
	spin_unlock(&ctl->tree_lock);
3032

3033 3034
	btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);

3035 3036 3037 3038
	/* finally drop our ref */
	btrfs_put_block_group(block_group);
}

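/*
 * Carve @bytes out of a bitmap entry that is part of a cluster, searching
 * from @min_start.  Returns the start offset, or 0 if nothing large enough
 * was found.
 */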
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
				   struct btrfs_free_cluster *cluster,
				   struct btrfs_free_space *entry,
				   u64 bytes, u64 min_start,
				   u64 *max_extent_size)
{
3045
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3046 3047 3048 3049 3050 3051 3052 3053
	int err;
	u64 search_start = cluster->window_start;
	u64 search_bytes = bytes;
	u64 ret = 0;

	search_start = min_start;
	search_bytes = bytes;

3054
	err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
3055
	if (err) {
		*max_extent_size = max(get_max_extent_size(entry),
				       *max_extent_size);
3058
		return 0;
3059
	}
3060 3061

	ret = search_start;
3062
	__bitmap_clear_bits(ctl, entry, ret, bytes);
3063 3064 3065 3066

	return ret;
}

3067 3068 3069 3070 3071
/*
 * given a cluster, try to allocate 'bytes' from it, returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
 * if things worked out
 */
3072
u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
3073
			     struct btrfs_free_cluster *cluster, u64 bytes,
3074
			     u64 min_start, u64 *max_extent_size)
3075
{
3076
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3077 3078
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
3079 3080 3081 3082
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;
	u64 ret = 0;

3083 3084
	ASSERT(!btrfs_is_zoned(block_group->fs_info));

3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096
	spin_lock(&cluster->lock);
	if (bytes > cluster->max_size)
		goto out;

	if (cluster->block_group != block_group)
		goto out;

	node = rb_first(&cluster->root);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_free_space, offset_index);
3097
	while (1) {
		if (entry->bytes < bytes)
			*max_extent_size = max(get_max_extent_size(entry),
					       *max_extent_size);
3101

3102 3103
		if (entry->bytes < bytes ||
		    (!entry->bitmap && entry->offset < min_start)) {
3104 3105 3106 3107 3108 3109 3110 3111
			node = rb_next(&entry->offset_index);
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
			continue;
		}

3112 3113 3114
		if (entry->bitmap) {
			ret = btrfs_alloc_from_bitmap(block_group,
						      cluster, entry, bytes,
3115 3116
						      cluster->window_start,
						      max_extent_size);
3117 3118 3119 3120 3121 3122 3123 3124
			if (ret == 0) {
				node = rb_next(&entry->offset_index);
				if (!node)
					break;
				entry = rb_entry(node, struct btrfs_free_space,
						 offset_index);
				continue;
			}
3125
			cluster->window_start += bytes;
3126 3127 3128 3129 3130 3131
		} else {
			ret = entry->offset;

			entry->offset += bytes;
			entry->bytes -= bytes;
		}
3132 3133 3134 3135 3136

		break;
	}
out:
	spin_unlock(&cluster->lock);
3137

3138 3139 3140
	if (!ret)
		return 0;

3141
	spin_lock(&ctl->tree_lock);
3142

3143 3144 3145
	if (!btrfs_free_space_trimmed(entry))
		atomic64_add(bytes, &discard_ctl->discard_bytes_saved);

3146
	ctl->free_space -= bytes;
3147 3148
	if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3149 3150

	spin_lock(&cluster->lock);
3151
	if (entry->bytes == 0) {
3152
		rb_erase(&entry->offset_index, &cluster->root);
3153
		ctl->free_extents--;
3154
		if (entry->bitmap) {
3155 3156
			kmem_cache_free(btrfs_free_space_bitmap_cachep,
					entry->bitmap);
3157
			ctl->total_bitmaps--;
3158
			recalculate_thresholds(ctl);
3159 3160
		} else if (!btrfs_free_space_trimmed(entry)) {
			ctl->discardable_extents[BTRFS_STAT_CURR]--;
3161
		}
3162
		kmem_cache_free(btrfs_free_space_cachep, entry);
3163 3164
	}

3165
	spin_unlock(&cluster->lock);
3166
	spin_unlock(&ctl->tree_lock);
3167

3168 3169 3170
	return ret;
}

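/*
 * Try to build a cluster out of a single bitmap entry: find a run of at
 * least cont1_bytes made up of pieces of at least min_bytes, then move the
 * entry from the free space tree into the cluster's rbtree.
 */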
static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
				struct btrfs_free_space *entry,
				struct btrfs_free_cluster *cluster,
				u64 offset, u64 bytes,
				u64 cont1_bytes, u64 min_bytes)
{
3177
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3178 3179
	unsigned long next_zero;
	unsigned long i;
3180 3181
	unsigned long want_bits;
	unsigned long min_bits;
3182
	unsigned long found_bits;
3183
	unsigned long max_bits = 0;
3184 3185
	unsigned long start = 0;
	unsigned long total_found = 0;
3186
	int ret;
3187

3188
	i = offset_to_bit(entry->offset, ctl->unit,
3189
			  max_t(u64, offset, entry->offset));
3190 3191
	want_bits = bytes_to_bits(bytes, ctl->unit);
	min_bits = bytes_to_bits(min_bytes, ctl->unit);
3192

3193 3194 3195 3196 3197 3198 3199
	/*
	 * Don't bother looking for a cluster in this bitmap if it's heavily
	 * fragmented.
	 */
	if (entry->max_extent_size &&
	    entry->max_extent_size < cont1_bytes)
		return -ENOSPC;
3200 3201
again:
	found_bits = 0;
3202
	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3203 3204
		next_zero = find_next_zero_bit(entry->bitmap,
					       BITS_PER_BITMAP, i);
3205
		if (next_zero - i >= min_bits) {
3206
			found_bits = next_zero - i;
3207 3208
			if (found_bits > max_bits)
				max_bits = found_bits;
3209 3210
			break;
		}
3211 3212
		if (next_zero - i > max_bits)
			max_bits = next_zero - i;
3213 3214 3215
		i = next_zero;
	}

3216 3217
	if (!found_bits) {
		entry->max_extent_size = (u64)max_bits * ctl->unit;
3218
		return -ENOSPC;
3219
	}
3220

3221
	if (!total_found) {
3222
		start = i;
3223
		cluster->max_size = 0;
3224 3225 3226 3227
	}

	total_found += found_bits;

3228 3229
	if (cluster->max_size < found_bits * ctl->unit)
		cluster->max_size = found_bits * ctl->unit;
3230

3231 3232
	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
		i = next_zero + 1;
3233 3234 3235
		goto again;
	}

3236
	cluster->window_start = start * ctl->unit + entry->offset;
3237
	rb_erase(&entry->offset_index, &ctl->free_space_offset);
3238 3239
	ret = tree_insert_offset(&cluster->root, entry->offset,
				 &entry->offset_index, 1);
3240
	ASSERT(!ret); /* -EEXIST; Logic error */
3241

	trace_btrfs_setup_cluster(block_group, cluster,
3243
				  total_found * ctl->unit, 1);
3244 3245 3246
	return 0;
}

3247 3248
/*
 * This searches the block group for just extents to fill the cluster with.
3249 3250
 * Try to find a cluster with at least bytes total bytes, at least one
 * extent of cont1_bytes, and other clusters of at least min_bytes.
3251
 */
3252
static noinline int
3253
setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3254 3255
			struct btrfs_free_cluster *cluster,
			struct list_head *bitmaps, u64 offset, u64 bytes,
3256
			u64 cont1_bytes, u64 min_bytes)
3257
{
3258
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3259 3260 3261 3262 3263 3264
	struct btrfs_free_space *first = NULL;
	struct btrfs_free_space *entry = NULL;
	struct btrfs_free_space *last;
	struct rb_node *node;
	u64 window_free;
	u64 max_extent;
	u64 total_size = 0;
3266

3267
	entry = tree_search_offset(ctl, offset, 0, 1);
3268 3269 3270 3271 3272 3273 3274
	if (!entry)
		return -ENOSPC;

	/*
	 * We don't want bitmaps, so just move along until we find a normal
	 * extent entry.
	 */
3275 3276
	while (entry->bitmap || entry->bytes < min_bytes) {
		if (entry->bitmap && list_empty(&entry->list))
3277
			list_add_tail(&entry->list, bitmaps);
3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288
		node = rb_next(&entry->offset_index);
		if (!node)
			return -ENOSPC;
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
	}

	window_free = entry->bytes;
	max_extent = entry->bytes;
	first = entry;
	last = entry;

3289 3290
	for (node = rb_next(&entry->offset_index); node;
	     node = rb_next(&entry->offset_index)) {
3291 3292
		entry = rb_entry(node, struct btrfs_free_space, offset_index);

3293 3294 3295
		if (entry->bitmap) {
			if (list_empty(&entry->list))
				list_add_tail(&entry->list, bitmaps);
3296
			continue;
3297 3298
		}

3299 3300 3301 3302 3303 3304
		if (entry->bytes < min_bytes)
			continue;

		last = entry;
		window_free += entry->bytes;
		if (entry->bytes > max_extent)
3305 3306 3307
			max_extent = entry->bytes;
	}

3308 3309 3310
	if (window_free < bytes || max_extent < cont1_bytes)
		return -ENOSPC;

3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323
	cluster->window_start = first->offset;

	node = &first->offset_index;

	/*
	 * now we've found our entries, pull them out of the free space
	 * cache and put them into the cluster rbtree
	 */
	do {
		int ret;

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
3324
		if (entry->bitmap || entry->bytes < min_bytes)
3325 3326
			continue;

3327
		rb_erase(&entry->offset_index, &ctl->free_space_offset);
3328 3329
		ret = tree_insert_offset(&cluster->root, entry->offset,
					 &entry->offset_index, 0);
		total_size += entry->bytes;
3331
		ASSERT(!ret); /* -EEXIST; Logic error */
3332 3333 3334
	} while (node && entry != last);

	cluster->max_size = max_extent;
	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3336 3337 3338 3339 3340 3341 3342
	return 0;
}

/*
 * This specifically looks for bitmaps that may work in the cluster, we assume
 * that we have already failed to find extents that will work.
 */
3343
static noinline int
3344
setup_cluster_bitmap(struct btrfs_block_group *block_group,
3345 3346
		     struct btrfs_free_cluster *cluster,
		     struct list_head *bitmaps, u64 offset, u64 bytes,
3347
		     u64 cont1_bytes, u64 min_bytes)
3348
{
3349
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3350
	struct btrfs_free_space *entry = NULL;
3351
	int ret = -ENOSPC;
3352
	u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3353

3354
	if (ctl->total_bitmaps == 0)
3355 3356
		return -ENOSPC;

3357 3358 3359 3360
	/*
	 * The bitmap that covers offset won't be in the list unless offset
	 * is just its start offset.
	 */
3361 3362 3363 3364
	if (!list_empty(bitmaps))
		entry = list_first_entry(bitmaps, struct btrfs_free_space, list);

	if (!entry || entry->offset != bitmap_offset) {
3365 3366 3367 3368 3369
		entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
		if (entry && list_empty(&entry->list))
			list_add(&entry->list, bitmaps);
	}

3370
	list_for_each_entry(entry, bitmaps, list) {
3371
		if (entry->bytes < bytes)
3372 3373
			continue;
		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3374
					   bytes, cont1_bytes, min_bytes);
3375 3376 3377 3378 3379
		if (!ret)
			return 0;
	}

	/*
3380 3381
	 * The bitmaps list has all the bitmaps that record free space
	 * starting after offset, so no more search is required.
3382
	 */
3383
	return -ENOSPC;
3384 3385
}

3386 3387
/*
 * here we try to find a cluster of blocks in a block group.  The goal
3388
 * is to find at least bytes+empty_size.
3389 3390 3391 3392 3393
 * We might not find them all in one contiguous area.
 *
 * returns zero and sets up cluster if things worked out, otherwise
 * it returns -enospc
 */
3394
int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3395 3396 3397
			     struct btrfs_free_cluster *cluster,
			     u64 offset, u64 bytes, u64 empty_size)
{
3398
	struct btrfs_fs_info *fs_info = block_group->fs_info;
3399
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3400
	struct btrfs_free_space *entry, *tmp;
3401
	LIST_HEAD(bitmaps);
3402
	u64 min_bytes;
3403
	u64 cont1_bytes;
3404 3405
	int ret;

3406 3407 3408 3409 3410 3411
	/*
	 * Choose the minimum extent size we'll require for this
	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
	 * For metadata, allow allocates with smaller extents.  For
	 * data, keep it dense.
	 */
3412
	if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3413
		cont1_bytes = min_bytes = bytes + empty_size;
3414
	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3415
		cont1_bytes = bytes;
3416
		min_bytes = fs_info->sectorsize;
3417 3418
	} else {
		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3419
		min_bytes = fs_info->sectorsize;
3420
	}
3421

3422
	spin_lock(&ctl->tree_lock);
3423 3424 3425 3426 3427

	/*
	 * If we know we don't have enough space to make a cluster don't even
	 * bother doing all the work to try and find one.
	 */
3428
	if (ctl->free_space < bytes) {
3429
		spin_unlock(&ctl->tree_lock);
3430 3431 3432
		return -ENOSPC;
	}

3433 3434 3435 3436 3437 3438 3439 3440
	spin_lock(&cluster->lock);

	/* someone already found a cluster, hooray */
	if (cluster->block_group) {
		ret = 0;
		goto out;
	}

	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
				 min_bytes);

3444
	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3445 3446
				      bytes + empty_size,
				      cont1_bytes, min_bytes);
3447
	if (ret)
3448
		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3449 3450
					   offset, bytes + empty_size,
					   cont1_bytes, min_bytes);
3451 3452 3453 3454

	/* Clear our temporary list */
	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
		list_del_init(&entry->list);
3455

3456
	if (!ret) {
3457
		btrfs_get_block_group(block_group);
3458 3459 3460
		list_add_tail(&cluster->block_group_list,
			      &block_group->cluster_list);
		cluster->block_group = block_group;
	} else {
		trace_btrfs_failed_cluster_setup(block_group);
3463 3464 3465
	}
out:
	spin_unlock(&cluster->lock);
3466
	spin_unlock(&ctl->tree_lock);
3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477

	return ret;
}

/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
3478
	cluster->root = RB_ROOT;
3479
	cluster->max_size = 0;
3480
	cluster->fragmented = false;
3481 3482 3483 3484
	INIT_LIST_HEAD(&cluster->block_group_list);
	cluster->block_group = NULL;
}

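/*
 * Discard [start, start + bytes) while the surrounding reserved range is
 * held out of the free space cache, then re-add the trimmed range and any
 * leftover reserved space with the appropriate trim state.
 */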
static int do_trimming(struct btrfs_block_group *block_group,
		       u64 *total_trimmed, u64 start, u64 bytes,
		       u64 reserved_start, u64 reserved_bytes,
		       enum btrfs_trim_state reserved_trim_state,
		       struct btrfs_trim_range *trim_entry)
{
3491
	struct btrfs_space_info *space_info = block_group->space_info;
3492
	struct btrfs_fs_info *fs_info = block_group->fs_info;
3493
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3494 3495
	int ret;
	int update = 0;
3496 3497 3498
	const u64 end = start + bytes;
	const u64 reserved_end = reserved_start + reserved_bytes;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3499
	u64 trimmed = 0;
3500

3501 3502 3503 3504 3505 3506 3507 3508 3509 3510
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	if (!block_group->ro) {
		block_group->reserved += reserved_bytes;
		space_info->bytes_reserved += reserved_bytes;
		update = 1;
	}
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

3511
	ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3512
	if (!ret) {
3513
		*total_trimmed += trimmed;
3514 3515
		trim_state = BTRFS_TRIM_STATE_TRIMMED;
	}
3516

3517
	mutex_lock(&ctl->cache_writeout_mutex);
3518 3519 3520 3521 3522 3523 3524 3525
	if (reserved_start < start)
		__btrfs_add_free_space(fs_info, ctl, reserved_start,
				       start - reserved_start,
				       reserved_trim_state);
	if (start + bytes < reserved_start + reserved_bytes)
		__btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
				       reserved_trim_state);
	__btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
3526 3527
	list_del(&trim_entry->list);
	mutex_unlock(&ctl->cache_writeout_mutex);
3528 3529 3530 3531 3532 3533 3534 3535 3536

	if (update) {
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);
		if (block_group->ro)
			space_info->bytes_readonly += reserved_bytes;
		block_group->reserved -= reserved_bytes;
		space_info->bytes_reserved -= reserved_bytes;
		spin_unlock(&block_group->lock);
3537
		spin_unlock(&space_info->lock);
3538 3539 3540 3541 3542
	}

	return ret;
}

3543 3544 3545
/*
 * If @async is set, then we will trim 1 region and return.
 */
static int trim_no_bitmap(struct btrfs_block_group *block_group,
			  u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			  bool async)
{
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	struct rb_node *node;
	int ret = 0;
	u64 extent_start;
	u64 extent_bytes;
	enum btrfs_trim_state extent_trim_state;
	u64 bytes;
	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

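	/*
	 * Walk the extent entries in offset order, discard each one and give
	 * it back to the free space cache once the discard has completed.
	 */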
	while (start < end) {
		struct btrfs_trim_range trim_entry;

		mutex_lock(&ctl->cache_writeout_mutex);
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen)
			goto out_unlock;

		entry = tree_search_offset(ctl, start, 0, 1);
		if (!entry)
			goto out_unlock;

		/* Skip bitmaps and if async, already trimmed entries */
		while (entry->bitmap ||
		       (async && btrfs_free_space_trimmed(entry))) {
			node = rb_next(&entry->offset_index);
			if (!node)
				goto out_unlock;
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
		}

		if (entry->offset >= end)
			goto out_unlock;

		extent_start = entry->offset;
		extent_bytes = entry->bytes;
		extent_trim_state = entry->trim_state;
		if (async) {
			start = entry->offset;
			bytes = entry->bytes;
			if (bytes < minlen) {
				spin_unlock(&ctl->tree_lock);
				mutex_unlock(&ctl->cache_writeout_mutex);
				goto next;
			}
			unlink_free_space(ctl, entry);
			/*
			 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
			 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
			 * X when we come back around.  So trim it now.
			 */
			if (max_discard_size &&
			    bytes >= (max_discard_size +
				      BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
				bytes = max_discard_size;
				extent_bytes = max_discard_size;
				entry->offset += max_discard_size;
				entry->bytes -= max_discard_size;
				link_free_space(ctl, entry);
			} else {
				kmem_cache_free(btrfs_free_space_cachep, entry);
			}
		} else {
			start = max(start, extent_start);
			bytes = min(extent_start + extent_bytes, end) - start;
			if (bytes < minlen) {
				spin_unlock(&ctl->tree_lock);
				mutex_unlock(&ctl->cache_writeout_mutex);
				goto next;
			}

			unlink_free_space(ctl, entry);
			kmem_cache_free(btrfs_free_space_cachep, entry);
		}

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = extent_start;
		trim_entry.bytes = extent_bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  extent_start, extent_bytes, extent_trim_state,
				  &trim_entry);
		if (ret) {
			block_group->discard_cursor = start + bytes;
			break;
		}
next:
		start += bytes;
		block_group->discard_cursor = start;
		if (async && *total_trimmed)
			break;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;

out_unlock:
	block_group->discard_cursor = btrfs_block_group_end(block_group);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

	return ret;
}

/*
 * If we break out of trimming a bitmap prematurely, we should reset the
 * trimming bit.  In a rather contrived case, it's possible to race here so
 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
 *
 * start = start of bitmap
 * end = near end of bitmap
 *
 * Thread 1:			Thread 2:
 * trim_bitmaps(start)
 *				trim_bitmaps(end)
 *				end_trimming_bitmap()
 * reset_trimming_bitmap()
 */
static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
{
	struct btrfs_free_space *entry;

	spin_lock(&ctl->tree_lock);
	entry = tree_search_offset(ctl, offset, 1, 0);
	if (entry) {
		if (btrfs_free_space_trimmed(entry)) {
			ctl->discardable_extents[BTRFS_STAT_CURR] +=
				entry->bitmap_extents;
			ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
		}
		entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
	}

	spin_unlock(&ctl->tree_lock);
}

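/*
 * Mark a bitmap that was under trimming as fully trimmed and drop its extents
 * and bytes from the discardable counters.
 */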
static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
				struct btrfs_free_space *entry)
{
	if (btrfs_free_space_trimming_bitmap(entry)) {
		entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			entry->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
	}
}

/*
 * If @async is set, then we will trim 1 region and return.
 */
static int trim_bitmaps(struct btrfs_block_group *block_group,
			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			u64 maxlen, bool async)
{
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	int ret = 0;
	int ret2;
	u64 bytes;
	u64 offset = offset_to_bitmap(ctl, start);
	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

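	/*
	 * Walk one bitmap-sized window at a time and discard the free ranges
	 * found in each bitmap.
	 */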
	while (offset < end) {
		bool next_bitmap = false;
		struct btrfs_trim_range trim_entry;

		mutex_lock(&ctl->cache_writeout_mutex);
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen) {
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			break;
		}

		entry = tree_search_offset(ctl, offset, 1, 0);
		/*
		 * Bitmaps are marked trimmed lossily now to prevent constant
		 * discarding of the same bitmap (the reason why we are bound
		 * by the filters).  So, retrim the block group bitmaps when we
		 * are preparing to punt to the unused_bgs list.  This uses
		 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
		 * which is the only discard index which sets minlen to 0.
		 */
		if (!entry || (async && minlen && start == offset &&
			       btrfs_free_space_trimmed(entry))) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * Async discard bitmap trimming begins by setting the start to
		 * key.objectid; offset_to_bitmap() then aligns to the start of
		 * the bitmap.  This lets us know we are fully
		 * scanning the bitmap rather than only some portion of it.
		 */
		if (start == offset)
			entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;

		bytes = minlen;
		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
		if (ret2 || start >= end) {
			/*
			 * We lossily consider a bitmap trimmed if we only skip
			 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
			 */
			if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
				end_trimming_bitmap(ctl, entry);
			else
				entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * We already trimmed a region, but are using the locking above
		 * to reset the trim_state.
		 */
		if (async && *total_trimmed) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto out;
		}

		bytes = min(bytes, end - start);
		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto next;
		}

		/*
		 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
		 * If X < @minlen, we won't trim X when we come back around.
		 * So trim it now.  We differ here from trimming extents as we
		 * don't keep individual state per bit.
		 */
		if (async &&
		    max_discard_size &&
		    bytes > (max_discard_size + minlen))
			bytes = max_discard_size;

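		/*
		 * Clear the bits under the lock so the range cannot be handed
		 * out to an allocator while we discard it; do_trimming() will
		 * re-add it to the free space cache afterwards.
		 */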
		bitmap_clear_bits(ctl, entry, start, bytes);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = start;
		trim_entry.bytes = bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  start, bytes, 0, &trim_entry);
		if (ret) {
			reset_trimming_bitmap(ctl, offset);
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			break;
		}
next:
		if (next_bitmap) {
			offset += BITS_PER_BITMAP * ctl->unit;
			start = offset;
		} else {
			start += bytes;
		}
		block_group->discard_cursor = start;

		if (fatal_signal_pending(current)) {
			if (start != offset)
				reset_trimming_bitmap(ctl, offset);
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	if (offset >= end)
		block_group->discard_cursor = end;

out:
	return ret;
}

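/*
 * Trim the whole block group synchronously: first the free extent entries,
 * then the bitmaps.  If the range ends in the middle of a bitmap, that
 * bitmap's trimming state is reset so a later trim will revisit it.
 */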
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int ret;
	u64 rem = 0;

	ASSERT(!btrfs_is_zoned(block_group->fs_info));

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
	if (ret)
		goto out;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
	/* If we ended in the middle of a bitmap, reset the trimming flag */
	if (rem)
		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
	btrfs_unfreeze_block_group(block_group);
	return ret;
}

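/*
 * Trim only the free extent entries of the block group.  With @async set, at
 * most one region is discarded per call.
 */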
int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
	btrfs_unfreeze_block_group(block_group);

	return ret;
}

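/*
 * Trim only the bitmap entries of the block group.  With @async set, at most
 * one region is discarded per call and @maxlen filters out ranges that are
 * larger than it.
 */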
int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
D
				   u64 maxlen, bool async)
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
			   async);

	btrfs_unfreeze_block_group(block_group);

	return ret;
}

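/*
 * Report whether the v1 free space cache is in use, i.e. whether the
 * superblock carries a non-zero cache generation.
 */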
bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}

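/*
 * Drop the free space cache inode of every block group as part of disabling
 * the v1 free space cache.
 */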
static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

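/*
 * Enable or disable the v1 free space cache.  Disabling removes all free
 * space inodes; either way the change is made persistent by committing a
 * transaction that updates the superblock cache generation.
 */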
int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Use this if you need to make a bitmap or extent entry specifically, it
 * doesn't do any of the merging that add_free_space does, this acts a lot like
 * how the free space cache loading stuff works, so you can get really weird
 * configurations.
 */
int test_add_free_space_entry(struct btrfs_block_group *cache,
			      u64 offset, u64 bytes, bool bitmap)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info = NULL, *bitmap_info;
	void *map = NULL;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
	u64 bytes_added;
	int ret;

again:
	if (!info) {
		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
		if (!info)
			return -ENOMEM;
	}

	if (!bitmap) {
		spin_lock(&ctl->tree_lock);
		info->offset = offset;
		info->bytes = bytes;
		info->max_extent_size = 0;
		ret = link_free_space(ctl, info);
		spin_unlock(&ctl->tree_lock);
		if (ret)
			kmem_cache_free(btrfs_free_space_cachep, info);
		return ret;
	}

	if (!map) {
		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
		if (!map) {
			kmem_cache_free(btrfs_free_space_cachep, info);
			return -ENOMEM;
		}
	}

	spin_lock(&ctl->tree_lock);
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		info->bitmap = map;
		map = NULL;
		add_new_bitmap(ctl, info, offset);
		bitmap_info = info;
		info = NULL;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);

	bytes -= bytes_added;
	offset += bytes_added;
	spin_unlock(&ctl->tree_lock);

	if (bytes)
		goto again;

	if (info)
		kmem_cache_free(btrfs_free_space_cachep, info);
	if (map)
		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
	return 0;
}

/*
 * Checks to see if the given range is in the free space cache.  This is really
 * just used to check the absence of space, so if there is free space in the
 * range at all we will return 1.
 */
int test_check_exists(struct btrfs_block_group *cache,
		      u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;

	spin_lock(&ctl->tree_lock);
	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info)
			goto out;
	}

have_info:
	if (info->bitmap) {
		u64 bit_off, bit_bytes;
		struct rb_node *n;
		struct btrfs_free_space *tmp;

		bit_off = offset;
		bit_bytes = ctl->unit;
		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
		if (!ret) {
			if (bit_off == offset) {
				ret = 1;
				goto out;
			} else if (bit_off > offset &&
				   offset + bytes > bit_off) {
				ret = 1;
				goto out;
			}
		}

		n = rb_prev(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (tmp->offset + tmp->bytes < offset)
				break;
			if (offset + bytes < tmp->offset) {
				n = rb_prev(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		n = rb_next(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (offset + bytes < tmp->offset)
				break;
			if (tmp->offset + tmp->bytes < offset) {
				n = rb_next(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		ret = 0;
		goto out;
	}

	if (info->offset == offset) {
		ret = 1;
		goto out;
	}

	if (offset > info->offset && offset < info->offset + info->bytes)
		ret = 1;
out:
	spin_unlock(&ctl->tree_lock);
	return ret;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */