// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M

struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes);

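/*
 * Look up the free space header item for @offset in @root and read the inode
 * it points at.  On success the caller owns the returned inode reference,
 * otherwise an ERR_PTR is returned.
 */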
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
	btrfs_release_path(path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));

	return inode;
}

struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
		struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct inode *inode = NULL;
	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(fs_info->tree_root, path,
					  block_group->start);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&block_group->lock);
	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
		btrfs_info(fs_info, "Old style space inode found, converting.");
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
			BTRFS_INODE_NODATACOW;
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

	if (!block_group->iref) {
		block_group->inode = igrab(inode);
		block_group->iref = 1;
	}
	spin_unlock(&block_group->lock);

	return inode;
}

static int __create_free_space_inode(struct btrfs_root *root,
				     struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     u64 ino, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	/* We inline CRCs for the free disk space cache */
	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
	int ret;

	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memzero_extent_buffer(leaf, (unsigned long)inode_item,
			     sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, flags);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item, offset);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;
}

int create_free_space_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
					 ino, block_group->start);
}

/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
				  struct inode *inode,
				  struct btrfs_block_group *block_group)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!inode)
		inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) != -ENOENT)
			ret = PTR_ERR(inode);
		goto out;
	}
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		btrfs_add_delayed_iput(inode);
		goto out;
	}
	clear_nlink(inode);
	/* One for the block groups ref */
	spin_lock(&block_group->lock);
	if (block_group->iref) {
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
	} else {
		spin_unlock(&block_group->lock);
	}
	/* One for the lookup ref */
	btrfs_add_delayed_iput(inode);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.type = 0;
	key.offset = block_group->start;
	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
				-1, 1);
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto out;
	}
	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
				       struct btrfs_block_rsv *rsv)
{
	u64 needed_bytes;
	int ret;

	/* 1 for slack space, 1 for updating the inode */
	needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
		btrfs_calc_metadata_size(fs_info, 1);

	spin_lock(&rsv->lock);
	if (rsv->reserved < needed_bytes)
		ret = -ENOSPC;
	else
		ret = 0;
	spin_unlock(&rsv->lock);
	return ret;
}

int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group *block_group,
				    struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	bool locked = false;

	if (block_group) {
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
		locked = true;
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * Now that we've truncated the cache away, it's no longer
		 * set up or written.
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
	}

	btrfs_i_size_write(BTRFS_I(inode), 0);
	truncate_pagecache(inode, 0);

	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
	 */
	ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
					 0, BTRFS_EXTENT_DATA_KEY);
	if (ret)
		goto fail;

	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));

fail:
	if (locked)
		mutex_unlock(&trans->transaction->cache_write_mutex);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

static void readahead_cache(struct inode *inode)
{
	struct file_ra_state *ra;
	unsigned long last_index;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return;

	file_ra_state_init(ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);

	kfree(ra);
}

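/*
 * The io_ctl helpers below stream the cache file one page at a time.  The
 * first page starts with one u32 crc per page followed by a u64 generation,
 * which is why io_ctl_init() refuses to write a cache whose checksum area
 * would not fit in a single page.
 */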
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
		       int write)
{
	int num_pages;

	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	/* Make sure we can fit our crcs and generation into the first page */
	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
		return -ENOSPC;

	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
	if (!io_ctl->pages)
		return -ENOMEM;

	io_ctl->num_pages = num_pages;
	io_ctl->fs_info = btrfs_sb(inode->i_sb);
	io_ctl->inode = inode;

	return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
	kfree(io_ctl->pages);
	io_ctl->pages = NULL;
}

static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}

static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
	ASSERT(io_ctl->index < io_ctl->num_pages);
	io_ctl->page = io_ctl->pages[io_ctl->index++];
	io_ctl->cur = page_address(io_ctl->page);
	io_ctl->orig = io_ctl->cur;
	io_ctl->size = PAGE_SIZE;
	if (clear)
		clear_page(io_ctl->cur);
}

static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
		if (io_ctl->pages[i]) {
			ClearPageChecked(io_ctl->pages[i]);
			unlock_page(io_ctl->pages[i]);
			put_page(io_ctl->pages[i]);
		}
	}
}

static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
	struct page *page;
	struct inode *inode = io_ctl->inode;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}
		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (page->mapping != inode->i_mapping) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "free space cache page truncated");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
			if (!PageUptodate(page)) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					   "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
		}
	}

	for (i = 0; i < io_ctl->num_pages; i++) {
		clear_page_dirty_for_io(io_ctl->pages[i]);
		set_page_extent_mapped(io_ctl->pages[i]);
	}

	return 0;
}

static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	io_ctl_map_page(io_ctl, 1);

	/*
	 * Skip the csum areas.  If we don't check crcs then we just have a
	 * 64bit chunk at the front of the first page.
	 */
	io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	put_unaligned_le64(generation, io_ctl->cur);
	io_ctl->cur += sizeof(u64);
}

static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	u64 cache_gen;

	/*
	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
	 * chunk at the front of the first page.
	 */
	io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	cache_gen = get_unaligned_le64(io_ctl->cur);
	if (cache_gen != generation) {
		btrfs_err_rl(io_ctl->fs_info,
			"space cache generation (%llu) does not match inode (%llu)",
				cache_gen, generation);
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}
	io_ctl->cur += sizeof(u64);
	return 0;
}

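/*
 * Checksum one page of the cache file.  The crc array lives at the start of
 * the first page, so page 0 is checksummed from just past that array while
 * every other page is checksummed in full.
 */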
static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	io_ctl_unmap_page(io_ctl);
	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	*tmp = crc;
}

static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp, val;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	val = *tmp;

	io_ctl_map_page(io_ctl, 0);
	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	if (val != crc) {
		btrfs_err_rl(io_ctl->fs_info,
			"csum mismatch on free space cache");
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}

	return 0;
}

static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
			    void *bitmap)
{
	struct btrfs_free_space_entry *entry;

	if (!io_ctl->cur)
		return -ENOSPC;

	entry = io_ctl->cur;
	put_unaligned_le64(offset, &entry->offset);
	put_unaligned_le64(bytes, &entry->bytes);
	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
		BTRFS_FREE_SPACE_EXTENT;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_set_crc(io_ctl, io_ctl->index - 1);

	/* No more pages to map */
	if (io_ctl->index >= io_ctl->num_pages)
		return 0;

	/* map the next page */
	io_ctl_map_page(io_ctl, 1);
	return 0;
}

static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
	if (!io_ctl->cur)
		return -ENOSPC;

	/*
	 * If we aren't at the start of the current page, unmap this one and
	 * map the next one if there is any left.
	 */
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

	copy_page(io_ctl->cur, bitmap);
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}

static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
	/*
	 * If we're not on the boundary we know we've modified the page and we
	 * need to crc the page.
	 */
	if (io_ctl->cur != io_ctl->orig)
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	else
		io_ctl_unmap_page(io_ctl);

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	}
}

static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
			    struct btrfs_free_space *entry, u8 *type)
{
	struct btrfs_free_space_entry *e;
	int ret;

	if (!io_ctl->cur) {
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}

	e = io_ctl->cur;
	entry->offset = get_unaligned_le64(&e->offset);
	entry->bytes = get_unaligned_le64(&e->bytes);
	*type = e->type;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_unmap_page(io_ctl);

	return 0;
}

static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space *entry)
{
	int ret;

	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
	if (ret)
		return ret;

	copy_page(entry->bitmap, io_ctl->cur);
	io_ctl_unmap_page(io_ctl);

	return 0;
}

static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group *block_group = ctl->private;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->length;
	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	max_bitmaps = max_t(u64, max_bitmaps, 1);

	ASSERT(ctl->total_bitmaps <= max_bitmaps);

	/*
	 * We are trying to keep the total amount of memory used per 1GiB of
	 * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
	 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
	 * bitmaps, we may end up using more memory than this.
	 */
	if (size < SZ_1G)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);

	bitmap_bytes = ctl->total_bitmaps * ctl->unit;

	/*
	 * we want the extent entry threshold to always be at most 1/2 the max
	 * bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);

	ctl->extents_thresh =
		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}

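/*
 * Read an on-disk free space cache (header item plus cache file) into @ctl.
 * Returns 1 if the cache was loaded, 0 if there was nothing usable to load,
 * and a negative value on error.
 */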
static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_path *path, u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct btrfs_io_ctl io_ctl;
	struct btrfs_key key;
	struct btrfs_free_space *e, *n;
	LIST_HEAD(bitmaps);
	u64 num_entries;
	u64 num_bitmaps;
	u64 generation;
	u8 type;
	int ret = 0;

	/* Nothing in the space cache, goodbye */
	if (!i_size_read(inode))
		return 0;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return 0;
	else if (ret > 0) {
		btrfs_release_path(path);
		return 0;
	}

	ret = -1;

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
	btrfs_release_path(path);

	if (!BTRFS_I(inode)->generation) {
		btrfs_info(fs_info,
			   "the free space cache file (%llu) is invalid, skip it",
			   offset);
		return 0;
	}

	if (BTRFS_I(inode)->generation != generation) {
		btrfs_err(fs_info,
			  "free space inode generation (%llu) did not match free space cache generation (%llu)",
			  BTRFS_I(inode)->generation, generation);
		return 0;
	}

	if (!num_entries)
		return 0;

	ret = io_ctl_init(&io_ctl, inode, 0);
	if (ret)
		return ret;

	readahead_cache(inode);

	ret = io_ctl_prepare_pages(&io_ctl, true);
	if (ret)
		goto out;

	ret = io_ctl_check_crc(&io_ctl, 0);
	if (ret)
		goto free_cache;

	ret = io_ctl_check_generation(&io_ctl, generation);
	if (ret)
		goto free_cache;

	while (num_entries) {
		e = kmem_cache_zalloc(btrfs_free_space_cachep,
				      GFP_NOFS);
		if (!e) {
			ret = -ENOMEM;
			goto free_cache;
		}

		ret = io_ctl_read_entry(&io_ctl, e, &type);
		if (ret) {
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (!e->bytes) {
			ret = -1;
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (type == BTRFS_FREE_SPACE_EXTENT) {
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				btrfs_err(fs_info,
					"Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
		} else {
			ASSERT(num_bitmaps);
			num_bitmaps--;
			e->bitmap = kmem_cache_zalloc(
					btrfs_free_space_bitmap_cachep, GFP_NOFS);
			if (!e->bitmap) {
				ret = -ENOMEM;
				kmem_cache_free(
					btrfs_free_space_cachep, e);
				goto free_cache;
			}
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			ctl->total_bitmaps++;
			recalculate_thresholds(ctl);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				btrfs_err(fs_info,
					"Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
			list_add_tail(&e->list, &bitmaps);
		}

		num_entries--;
	}

	io_ctl_unmap_page(&io_ctl);

	/*
	 * We add the bitmaps at the end of the entries in order that
	 * the bitmap entries are added to the cache.
	 */
	list_for_each_entry_safe(e, n, &bitmaps, list) {
		list_del_init(&e->list);
		ret = io_ctl_read_bitmap(&io_ctl, e);
		if (ret)
			goto free_cache;
	}

	io_ctl_drop_pages(&io_ctl);
	ret = 1;
out:
	io_ctl_free(&io_ctl);
	return ret;
free_cache:
	io_ctl_drop_pages(&io_ctl);
	__btrfs_remove_free_space_cache(ctl);
	goto out;
}

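/*
 * Move every entry from the temporary ctl used while loading into the block
 * group's real free space tree, expanding bitmap entries into ranges via
 * search_bitmap()/bitmap_clear_bits().
 */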
static int copy_free_space_cache(struct btrfs_block_group *block_group,
				 struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int ret = 0;

	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info);
			ret = btrfs_add_free_space(block_group, info->offset,
						   info->bytes);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			u64 offset = info->offset;
			u64 bytes = ctl->unit;

			while (search_bitmap(ctl, info, &offset, &bytes,
					     false) == 0) {
				ret = btrfs_add_free_space(block_group, offset,
							   bytes);
				if (ret)
					break;
				bitmap_clear_bits(ctl, info, offset, bytes);
				offset = info->offset;
				bytes = ctl->unit;
			}
			free_bitmap(ctl, info);
		}
		cond_resched();
	}
	return ret;
}

int load_free_space_cache(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space_ctl tmp_ctl = {};
	struct inode *inode;
	struct btrfs_path *path;
	int ret = 0;
	bool matched;
	u64 used = block_group->used;

	/*
	 * Because we could potentially discard our loaded free space, we want
	 * to load everything into a temporary structure first, and then if it's
	 * valid copy it all into the actual free space ctl.
	 */
	btrfs_init_free_space_ctl(block_group, &tmp_ctl);

	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	path = btrfs_alloc_path();
	if (!path)
		return 0;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * We must pass a path with search_commit_root set to btrfs_iget in
	 * order to avoid a deadlock when allocating extents for the tree root.
	 *
	 * When we are COWing an extent buffer from the tree root, when looking
	 * for a free extent, at extent-tree.c:find_free_extent(), we can find a
	 * block group without its free space cache loaded. When we find one
	 * we must load its space cache which requires reading its free space
	 * cache's inode item from the root tree. If this inode item is located
	 * in the same leaf that we started COWing before, then we end up in
	 * deadlock on the extent buffer (trying to read lock it when we
	 * previously write locked it).
	 *
	 * It's safe to read the inode item using the commit root because
	 * block groups, once loaded, stay in memory forever (until they are
	 * removed) as well as their space caches once loaded. New block groups
	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
	 * we will never try to read their inode item while the fs is mounted.
	 */
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

	/* We may have converted the inode and made the cache invalid. */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
		goto out;
	}
	spin_unlock(&block_group->lock);

	ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
				      path, block_group->start);
	btrfs_free_path(path);
	if (ret <= 0)
		goto out;

	matched = (tmp_ctl.free_space == (block_group->length - used -
					  block_group->bytes_super));

	if (matched) {
		ret = copy_free_space_cache(block_group, &tmp_ctl);
		/*
		 * ret == 1 means we successfully loaded the free space cache,
		 * so we need to re-set it here.
		 */
		if (ret == 0)
			ret = 1;
	} else {
		__btrfs_remove_free_space_cache(&tmp_ctl);
		btrfs_warn(fs_info,
			   "block group %llu has wrong amount of free space",
			   block_group->start);
		ret = -1;
	}
out:
	if (ret < 0) {
		/* This cache is bogus, make sure it gets cleared */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		ret = 0;

		btrfs_warn(fs_info,
			   "failed to load free space cache for block group %llu, rebuilding it now",
			   block_group->start);
	}

	spin_lock(&ctl->tree_lock);
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
	iput(inode);
	return ret;
}

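/*
 * Write every extent entry into the io_ctl: entries from the free space tree
 * and the block group's allocation cluster, plus any ranges temporarily
 * removed for trimming.  Bitmap entries are queued on @bitmap_list so they
 * can be written after the extents.
 */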
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space_ctl *ctl,
			      struct btrfs_block_group *block_group,
			      int *entries, int *bitmaps,
			      struct list_head *bitmap_list)
{
	int ret;
	struct btrfs_free_cluster *cluster = NULL;
	struct btrfs_free_cluster *cluster_locked = NULL;
	struct rb_node *node = rb_first(&ctl->free_space_offset);
	struct btrfs_trim_range *trim_entry;

	/* Get the cluster for this block_group if it exists */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
	}

	if (!node && cluster) {
		cluster_locked = cluster;
		spin_lock(&cluster_locked->lock);
		node = rb_first(&cluster->root);
		cluster = NULL;
	}

	/* Write out the extent entries */
	while (node) {
		struct btrfs_free_space *e;

		e = rb_entry(node, struct btrfs_free_space, offset_index);
		*entries += 1;

		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
				       e->bitmap);
		if (ret)
			goto fail;

		if (e->bitmap) {
			list_add_tail(&e->list, bitmap_list);
			*bitmaps += 1;
		}
		node = rb_next(node);
		if (!node && cluster) {
			node = rb_first(&cluster->root);
			cluster_locked = cluster;
			spin_lock(&cluster_locked->lock);
			cluster = NULL;
		}
	}
	if (cluster_locked) {
		spin_unlock(&cluster_locked->lock);
		cluster_locked = NULL;
	}

	/*
	 * Make sure we don't miss any range that was removed from our rbtree
	 * because trimming is running. Otherwise after a umount+mount (or crash
	 * after committing the transaction) we would leak free space and get
	 * an inconsistent free space cache report from fsck.
	 */
	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
				       trim_entry->bytes, NULL);
		if (ret)
			goto fail;
		*entries += 1;
	}

	return 0;
fail:
	if (cluster_locked)
		spin_unlock(&cluster_locked->lock);
	return -ENOSPC;
}

static noinline_for_stack int
update_cache_item(struct btrfs_trans_handle *trans,
		  struct btrfs_root *root,
		  struct inode *inode,
		  struct btrfs_path *path, u64 offset,
		  int entries, int bitmaps)
{
	struct btrfs_key key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, 0, 0, NULL);
		goto fail;
	}
	leaf = path->nodes[0];
	if (ret > 0) {
		struct btrfs_key found_key;
		ASSERT(path->slots[0]);
		path->slots[0]--;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
		    found_key.offset != offset) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
					 inode->i_size - 1, EXTENT_DELALLOC, 0,
					 0, NULL);
			btrfs_release_path(path);
			goto fail;
		}
	}

	BTRFS_I(inode)->generation = trans->transid;
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_set_free_space_entries(leaf, header, entries);
	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;

fail:
	return -1;
}

static noinline_for_stack int write_pinned_extent_entries(
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_io_ctl *io_ctl,
			    int *entries)
{
	u64 start, extent_start, extent_end, len;
	struct extent_io_tree *unpin = NULL;
	int ret;

	if (!block_group)
		return 0;

	/*
	 * We want to add any pinned extents to our free space cache
	 * so we don't leak the space
	 *
	 * We shouldn't have switched the pinned extents yet so this is the
	 * right one
	 */
	unpin = &trans->transaction->pinned_extents;

	start = block_group->start;

	while (start < block_group->start + block_group->length) {
		ret = find_first_extent_bit(unpin, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY, NULL);
		if (ret)
			return 0;

		/* This pinned extent is out of our range */
		if (extent_start >= block_group->start + block_group->length)
			return 0;

		extent_start = max(extent_start, start);
		extent_end = min(block_group->start + block_group->length,
				 extent_end + 1);
		len = extent_end - extent_start;

		*entries += 1;
		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
		if (ret)
			return -ENOSPC;

		start = extent_end;
	}

	return 0;
}

static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;
	int ret;

	/* Write out the bitmaps */
	list_for_each_entry_safe(entry, next, bitmap_list, list) {
		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
		if (ret)
			return -ENOSPC;
		list_del_init(&entry->list);
	}

	return 0;
}

static int flush_dirty_cache(struct inode *inode)
{
	int ret;

	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret)
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, 0, 0, NULL);

	return ret;
}

static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;

	list_for_each_entry_safe(entry, next, bitmap_list, list)
		list_del_init(&entry->list);
}

static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
			   struct btrfs_io_ctl *io_ctl,
			   struct extent_state **cached_state)
{
	io_ctl_drop_pages(io_ctl);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, cached_state);
}

static int __btrfs_wait_cache_io(struct btrfs_root *root,
				 struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *block_group,
				 struct btrfs_io_ctl *io_ctl,
				 struct btrfs_path *path, u64 offset)
{
	int ret;
	struct inode *inode = io_ctl->inode;

	if (!inode)
		return 0;

	/* Flush the dirty pages in the cache file. */
	ret = flush_dirty_cache(inode);
	if (ret)
		goto out;

	/* Update the cache item to tell everyone this cache file is valid. */
	ret = update_cache_item(trans, root, inode, path, offset,
				io_ctl->entries, io_ctl->bitmaps);
out:
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
		if (block_group)
			btrfs_debug(root->fs_info,
	  "failed to write free space cache for block group %llu error %d",
				  block_group->start, ret);
	}
	btrfs_update_inode(trans, root, BTRFS_I(inode));

	if (block_group) {
		/* the dirty list is protected by the dirty_bgs_lock */
		spin_lock(&trans->transaction->dirty_bgs_lock);

		/* the disk_cache_state is protected by the block group lock */
		spin_lock(&block_group->lock);

		/*
		 * only mark this as written if we didn't get put back on
		 * the dirty list while waiting for IO.   Otherwise our
		 * cache state won't be right, and we won't get written again
		 */
		if (!ret && list_empty(&block_group->dirty_list))
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		else if (ret)
			block_group->disk_cache_state = BTRFS_DC_ERROR;

		spin_unlock(&block_group->lock);
		spin_unlock(&trans->transaction->dirty_bgs_lock);
		io_ctl->inode = NULL;
		iput(inode);
	}

	return ret;

}

int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
			struct btrfs_block_group *block_group,
			struct btrfs_path *path)
{
	return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
				     block_group, &block_group->io_ctl,
				     path, block_group->start);
}

/**
 * Write out cached info to an inode
 *
 * @root:        root the inode belongs to
 * @inode:       freespace inode we are writing out
 * @ctl:         free space cache we are going to write out
 * @block_group: block_group for this cache if it belongs to a block_group
 * @io_ctl:      holds context for the io
 * @trans:       the trans handle
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount.  This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 */
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_block_group *block_group,
				   struct btrfs_io_ctl *io_ctl,
				   struct btrfs_trans_handle *trans)
{
	struct extent_state *cached_state = NULL;
	LIST_HEAD(bitmap_list);
	int entries = 0;
	int bitmaps = 0;
	int ret;
	int must_iput = 0;

	if (!i_size_read(inode))
		return -EIO;

	WARN_ON(io_ctl->pages);
	ret = io_ctl_init(io_ctl, inode, 1);
	if (ret)
		return ret;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
		down_write(&block_group->data_rwsem);
		spin_lock(&block_group->lock);
		if (block_group->delalloc_bytes) {
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
			spin_unlock(&block_group->lock);
			up_write(&block_group->data_rwsem);
			BTRFS_I(inode)->generation = 0;
			ret = 0;
			must_iput = 1;
			goto out;
		}
		spin_unlock(&block_group->lock);
	}

	/* Lock all pages first so we can lock the extent safely. */
	ret = io_ctl_prepare_pages(io_ctl, false);
	if (ret)
		goto out_unlock;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
			 &cached_state);

	io_ctl_set_generation(io_ctl, trans->transid);

	mutex_lock(&ctl->cache_writeout_mutex);
	/* Write out the extent entries in the free space cache */
	spin_lock(&ctl->tree_lock);
	ret = write_cache_extent_entries(io_ctl, ctl,
					 block_group, &entries, &bitmaps,
					 &bitmap_list);
	if (ret)
		goto out_nospc_locked;

	/*
	 * Some spaces that are freed in the current transaction are pinned,
	 * they will be added into free space cache after the transaction is
	 * committed, we shouldn't lose them.
	 *
	 * If this changes while we are working we'll get added back to
	 * the dirty list and redo it.  No locking needed
	 */
	ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
	if (ret)
		goto out_nospc_locked;

	/*
	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
	 * locked while doing it because a concurrent trim can be manipulating
	 * or freeing the bitmap.
	 */
	ret = write_bitmap_entries(io_ctl, &bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);
	if (ret)
		goto out_nospc;

	/* Zero out the rest of the pages just to make sure */
	io_ctl_zero_remaining_pages(io_ctl);

	/* Everything is written out, now we dirty the pages in the file. */
	ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
				io_ctl->num_pages, 0, i_size_read(inode),
				&cached_state, false);
	if (ret)
		goto out_nospc;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);
	/*
	 * Release the pages and unlock the extent, we will flush
	 * them out later
	 */
	io_ctl_drop_pages(io_ctl);
	io_ctl_free(io_ctl);

	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, &cached_state);

	/*
	 * at this point the pages are under IO and we're happy,
	 * The caller is responsible for waiting on them and updating
	 * the cache and the inode
	 */
	io_ctl->entries = entries;
	io_ctl->bitmaps = bitmaps;

	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
	if (ret)
		goto out;

	return 0;

out_nospc_locked:
	cleanup_bitmap_list(&bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

out_nospc:
	cleanup_write_cache_enospc(inode, io_ctl, &cached_state);

out_unlock:
	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);

out:
	io_ctl->inode = NULL;
	io_ctl_free(io_ctl);
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
	}
	btrfs_update_inode(trans, root, BTRFS_I(inode));
	if (must_iput)
		iput(inode);
	return ret;
}

int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
			  struct btrfs_block_group *block_group,
			  struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct inode *inode;
	int ret = 0;

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode))
		return 0;

	ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
				block_group, &block_group->io_ctl, trans);
	if (ret) {
		btrfs_debug(fs_info,
	  "failed to write free space cache for block group %llu error %d",
			  block_group->start, ret);
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);

		block_group->io_ctl.inode = NULL;
		iput(inode);
	}

	/*
	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
	 * to wait for IO and put the inode
	 */

	return ret;
}

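/*
 * Helpers for converting between byte offsets/lengths and bit positions
 * inside a bitmap entry.  Each bitmap covers BITS_PER_BITMAP * ctl->unit
 * bytes, and offset_to_bitmap() rounds an offset down to the start of the
 * bitmap that would contain it.
 */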
static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
					  u64 offset)
{
	ASSERT(offset >= bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
	return (unsigned long)(div_u64(bytes, unit));
}

static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += ctl->start;

	return bitmap_start;
}

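/*
 * Insert a free space entry into the rbtree indexed by offset.  An extent
 * entry and a bitmap entry may legitimately share an offset; the comment in
 * the collision branch below explains how that tie is broken.
 */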
static int tree_insert_offset(struct rb_root *root, u64 offset,
			      struct rb_node *node, int bitmap)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset.  If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap.  So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically.  If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
			if (bitmap) {
				if (info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_right;
			} else {
				if (!info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_left;
			}
		}
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);

	return 0;
}

/*
 * searches the tree for the given offset.
 *
 * fuzzy - If this is set, then we are trying to make an allocation, and we just
 * want a section that has at least bytes size and comes at or after the given
 * offset.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_free_space_ctl *ctl,
		   u64 offset, int bitmap_only, int fuzzy)
{
	struct rb_node *n = ctl->free_space_offset.rb_node;
	struct btrfs_free_space *entry, *prev = NULL;

	/* find entry that is closest to the 'offset' */
	while (1) {
		if (!n) {
			entry = NULL;
			break;
		}

		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		prev = entry;

		if (offset < entry->offset)
			n = n->rb_left;
		else if (offset > entry->offset)
			n = n->rb_right;
		else
			break;
	}

	if (bitmap_only) {
		if (!entry)
			return NULL;
		if (entry->bitmap)
			return entry;

		/*
		 * bitmap entry and extent entry may share same offset,
		 * in that case, bitmap entry comes after extent entry.
		 */
		n = rb_next(n);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->offset != offset)
			return NULL;

		WARN_ON(!entry->bitmap);
		return entry;
	} else if (entry) {
		if (entry->bitmap) {
			/*
			 * if previous extent entry covers the offset,
			 * we should return it instead of the bitmap entry
			 */
			n = rb_prev(&entry->offset_index);
			if (n) {
				prev = rb_entry(n, struct btrfs_free_space,
						offset_index);
				if (!prev->bitmap &&
				    prev->offset + prev->bytes > offset)
					entry = prev;
			}
		}
		return entry;
	}

	if (!prev)
		return NULL;

	/* find last entry before the 'offset' */
	entry = prev;
	if (entry->offset > offset) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			entry = rb_entry(n, struct btrfs_free_space,
					offset_index);
			ASSERT(entry->offset <= offset);
		} else {
			if (fuzzy)
				return entry;
			else
				return NULL;
		}
	}

	if (entry->bitmap) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			prev = rb_entry(n, struct btrfs_free_space,
					offset_index);
			if (!prev->bitmap &&
			    prev->offset + prev->bytes > offset)
				return prev;
		}
		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
			return entry;
	} else if (entry->offset + entry->bytes > offset)
		return entry;

	if (!fuzzy)
		return NULL;

	while (1) {
		if (entry->bitmap) {
			if (entry->offset + BITS_PER_BITMAP *
			    ctl->unit > offset)
				break;
		} else {
			if (entry->offset + entry->bytes > offset)
				break;
		}

		n = rb_next(&entry->offset_index);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
	}
	return entry;
}

static inline void
__unlink_free_space(struct btrfs_free_space_ctl *ctl,
		    struct btrfs_free_space *info)
{
	rb_erase(&info->offset_index, &ctl->free_space_offset);
	ctl->free_extents--;

	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR]--;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
	}
}

static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	__unlink_free_space(ctl, info);
	ctl->free_space -= info->bytes;
}

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info)
{
	int ret = 0;

	ASSERT(info->bytes || info->bitmap);
	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
				 &info->offset_index, (info->bitmap != NULL));
	if (ret)
		return ret;

	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR]++;
		ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
	}

	ctl->free_space += info->bytes;
	ctl->free_extents++;
	return ret;
}

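/*
 * Clear or set a byte range inside a bitmap entry and keep the entry's byte
 * count, bitmap_extents count and the ctl's discardable stats in sync.
 * extent_delta tracks the net change in the number of contiguous free
 * regions, based on whether the bits neighbouring the range are set.
 */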
1739 1740 1741
static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *info,
				       u64 offset, u64 bytes)
1742
{
1743 1744
	unsigned long start, count, end;
	int extent_delta = -1;
1745

1746 1747
	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
1748 1749
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);
1750

L
Li Zefan 已提交
1751
	bitmap_clear(info->bitmap, start, count);
1752 1753

	info->bytes -= bytes;
1754 1755
	if (info->max_extent_size > ctl->unit)
		info->max_extent_size = 0;
1756 1757 1758 1759 1760 1761 1762 1763

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta++;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta++;

	info->bitmap_extents += extent_delta;
1764
	if (!btrfs_free_space_trimmed(info)) {
1765
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1766 1767
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
	}
1768 1769 1770 1771 1772 1773 1774
}

static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes)
{
	__bitmap_clear_bits(ctl, info, offset, bytes);
1775
	ctl->free_space -= bytes;
1776 1777
}

1778
static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
J
Josef Bacik 已提交
1779 1780
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
1781
{
1782 1783
	unsigned long start, count, end;
	int extent_delta = 1;
1784

1785 1786
	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
1787 1788
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);
1789

L
Li Zefan 已提交
1790
	bitmap_set(info->bitmap, start, count);
1791 1792

	info->bytes += bytes;
1793
	ctl->free_space += bytes;
1794 1795 1796 1797 1798 1799 1800 1801

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta--;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta--;

	info->bitmap_extents += extent_delta;
1802
	if (!btrfs_free_space_trimmed(info)) {
1803
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1804 1805
		ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
	}
1806 1807
}

1808 1809 1810 1811
/*
 * If we cannot find a suitable extent, we will use bytes to record
 * the size of the max extent.
 */
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc)
{
	unsigned long found_bits = 0;
	unsigned long max_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;
	unsigned long extent_bits;

	/*
	 * Skip searching the bitmap if we don't have a contiguous section that
	 * is large enough for this allocation.
	 */
	if (for_alloc &&
	    bitmap_info->max_extent_size &&
	    bitmap_info->max_extent_size < *bytes) {
		*bytes = bitmap_info->max_extent_size;
		return -1;
	}

	i = offset_to_bit(bitmap_info->offset, ctl->unit,
			  max_t(u64, *offset, bitmap_info->offset));
	bits = bytes_to_bits(*bytes, ctl->unit);

	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
		if (for_alloc && bits == 1) {
			found_bits = 1;
			break;
		}
		next_zero = find_next_zero_bit(bitmap_info->bitmap,
					       BITS_PER_BITMAP, i);
		extent_bits = next_zero - i;
		if (extent_bits >= bits) {
			found_bits = extent_bits;
			break;
		} else if (extent_bits > max_bits) {
			max_bits = extent_bits;
		}
		i = next_zero;
	}

	if (found_bits) {
		*offset = (u64)(i * ctl->unit) + bitmap_info->offset;
		*bytes = (u64)(found_bits) * ctl->unit;
		return 0;
	}

	*bytes = (u64)(max_bits) * ctl->unit;
	bitmap_info->max_extent_size = *bytes;
	return -1;
}

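/*
 * Worked example for search_bitmap() (illustrative, assuming ctl->unit is a
 * 4K sectorsize): a request of *bytes = 64K converts to bits = 16.  If the
 * longest run of set (free) bits in this bitmap is only 8 bits (32K), the
 * scan fails, *bytes is rewritten to 32K and that value is cached in
 * max_extent_size, so a later allocation asking for 64K or more can skip
 * this bitmap without rescanning it.
 */
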
static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
{
	if (entry->bitmap)
		return entry->max_extent_size;
	return entry->bytes;
}

/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
		unsigned long align, u64 *max_extent_size)
{
	struct btrfs_free_space *entry;
	struct rb_node *node;
	u64 tmp;
	u64 align_off;
	int ret;

	if (!ctl->free_space_offset.rb_node)
		goto out;

	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
	if (!entry)
		goto out;

	for (node = &entry->offset_index; node; node = rb_next(node)) {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (entry->bytes < *bytes) {
			*max_extent_size = max(get_max_extent_size(entry),
					       *max_extent_size);
			continue;
		}

		/*
		 * make sure the space returned is big enough
		 * to match our requested alignment
		 */
		if (*bytes >= align) {
			tmp = entry->offset - ctl->start + align - 1;
			tmp = div64_u64(tmp, align);
			tmp = tmp * align + ctl->start;
			align_off = tmp - entry->offset;
		} else {
			align_off = 0;
			tmp = entry->offset;
		}

		if (entry->bytes < *bytes + align_off) {
			*max_extent_size = max(get_max_extent_size(entry),
					       *max_extent_size);
			continue;
		}

		if (entry->bitmap) {
			u64 size = *bytes;

			ret = search_bitmap(ctl, entry, &tmp, &size, true);
			if (!ret) {
				*offset = tmp;
				*bytes = size;
				return entry;
			} else {
				*max_extent_size =
					max(get_max_extent_size(entry),
					    *max_extent_size);
			}
			continue;
		}

		*offset = tmp;
		*bytes = entry->bytes - align_off;
		return entry;
	}
out:
	return NULL;
}
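
/*
 * Alignment sketch for find_free_space() (illustrative, assuming
 * ctl->start = 1G and align = 64K): an extent entry at offset 1G + 100K is
 * rounded up to the next 64K boundary relative to the block group start,
 * i.e. 1G + 128K, giving align_off = 28K.  The entry only satisfies the
 * request if it still has *bytes of room left after those 28K are skipped.
 */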

static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info, u64 offset)
{
	info->offset = offset_to_bitmap(ctl, offset);
	info->bytes = 0;
	info->bitmap_extents = 0;
	INIT_LIST_HEAD(&info->list);
	link_free_space(ctl, info);
	ctl->total_bitmaps++;
	recalculate_thresholds(ctl);
}

static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info)
{
	/*
	 * Normally when this is called, the bitmap is completely empty. However,
	 * if we are blowing up the free space cache for one reason or another
	 * via __btrfs_remove_free_space_cache(), then it may not be freed and
	 * we may leave stats on the table.
	 */
	if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			bitmap_info->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
	}
	unlink_free_space(ctl, bitmap_info);
	kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
	ctl->total_bitmaps--;
	recalculate_thresholds(ctl);
}

static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1976 1977 1978 1979
			      struct btrfs_free_space *bitmap_info,
			      u64 *offset, u64 *bytes)
{
	u64 end;
1980 1981
	u64 search_start, search_bytes;
	int ret;
1982 1983

again:
1984
	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1985

	/*
	 * We need to search for bits in this bitmap.  We could only cover some
	 * of the extent in this bitmap thanks to how we add space, so we need
	 * to search for as much of it as we can and clear that amount, and then
	 * go searching for the next bit.
	 */
	search_start = *offset;
1993
	search_bytes = ctl->unit;
1994
	search_bytes = min(search_bytes, end - search_start + 1);
1995 1996
	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
			    false);
1997 1998
	if (ret < 0 || search_start != *offset)
		return -EINVAL;
1999

2000 2001 2002 2003 2004 2005 2006 2007 2008
	/* We may have found more bits than what we need */
	search_bytes = min(search_bytes, *bytes);

	/* Cannot clear past the end of the bitmap */
	search_bytes = min(search_bytes, end - search_start + 1);

	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
	*offset += search_bytes;
	*bytes -= search_bytes;
2009 2010

	if (*bytes) {
2011
		struct rb_node *next = rb_next(&bitmap_info->offset_index);
2012
		if (!bitmap_info->bytes)
2013
			free_bitmap(ctl, bitmap_info);
2014

2015 2016 2017 2018 2019
		/*
		 * no entry after this bitmap, but we still have bytes to
		 * remove, so something has gone wrong.
		 */
		if (!next)
2020 2021
			return -EINVAL;

2022 2023 2024 2025 2026 2027 2028
		bitmap_info = rb_entry(next, struct btrfs_free_space,
				       offset_index);

		/*
		 * if the next entry isn't a bitmap we need to return to let the
		 * extent stuff do its work.
		 */
2029 2030 2031
		if (!bitmap_info->bitmap)
			return -EAGAIN;

2032 2033 2034 2035 2036 2037 2038
		/*
		 * Ok the next item is a bitmap, but it may not actually hold
		 * the information for the rest of this free space stuff, so
		 * look for it, and if we don't find it return so we can try
		 * everything over again.
		 */
		search_start = *offset;
2039
		search_bytes = ctl->unit;
2040
		ret = search_bitmap(ctl, bitmap_info, &search_start,
2041
				    &search_bytes, false);
2042 2043 2044
		if (ret < 0 || search_start != *offset)
			return -EAGAIN;

2045
		goto again;
2046
	} else if (!bitmap_info->bytes)
2047
		free_bitmap(ctl, bitmap_info);
2048 2049 2050 2051

	return 0;
}

J
Josef Bacik 已提交
2052 2053
static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
			       struct btrfs_free_space *info, u64 offset,
2054
			       u64 bytes, enum btrfs_trim_state trim_state)
J
Josef Bacik 已提交
2055 2056 2057 2058
{
	u64 bytes_to_set = 0;
	u64 end;

2059 2060 2061 2062
	/*
	 * This is a tradeoff to make bitmap trim state minimal.  We mark the
	 * whole bitmap untrimmed if at any point we add untrimmed regions.
	 */
2063
	if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
2064
		if (btrfs_free_space_trimmed(info)) {
2065 2066
			ctl->discardable_extents[BTRFS_STAT_CURR] +=
				info->bitmap_extents;
2067 2068
			ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
		}
2069
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2070
	}
2071

J
Josef Bacik 已提交
2072 2073 2074 2075 2076 2077
	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);

	bytes_to_set = min(end - offset, bytes);

	bitmap_set_bits(ctl, info, offset, bytes_to_set);

2078 2079 2080 2081 2082 2083
	/*
	 * We set some bytes, we have no idea what the max extent size is
	 * anymore.
	 */
	info->max_extent_size = 0;

J
Josef Bacik 已提交
2084 2085 2086 2087
	return bytes_to_set;

}
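
/*
 * Illustrative consequence of the tradeoff described in add_bytes_to_bitmap()
 * above: adding a single untrimmed 4K region to a bitmap that was fully
 * trimmed flips the whole bitmap back to BTRFS_TRIM_STATE_UNTRIMMED and adds
 * its extents and bytes back to the discardable counters, so async discard
 * will revisit the entire bitmap.
 */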

2088 2089
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
		      struct btrfs_free_space *info)
2090
{
2091
	struct btrfs_block_group *block_group = ctl->private;
2092
	struct btrfs_fs_info *fs_info = block_group->fs_info;
2093 2094 2095
	bool forced = false;

#ifdef CONFIG_BTRFS_DEBUG
2096
	if (btrfs_should_fragment_free_space(block_group))
2097 2098
		forced = true;
#endif
2099

2100 2101 2102 2103
	/* This is a way to reclaim large regions from the bitmaps. */
	if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
		return false;

2104 2105 2106 2107
	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent, and don't have to deal with the bitmap
	 */
2108
	if (!forced && ctl->free_extents < ctl->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them for larger extents, however if we have plenty
		 * of cache left then go ahead and add them, no sense in adding
		 * the overhead of a bitmap if we don't have to.
		 */
2116 2117
		if (info->bytes <= fs_info->sectorsize * 8) {
			if (ctl->free_extents * 3 <= ctl->extents_thresh)
2118
				return false;
2119
		} else {
2120
			return false;
2121 2122
		}
	}
2123 2124

	/*
2125 2126 2127 2128 2129 2130
	 * The original block groups from mkfs can be really small, like 8
	 * megabytes, so don't bother with a bitmap for those entries.  However
	 * some block groups can be smaller than what a bitmap would cover but
	 * are still large enough that they could overflow the 32k memory limit,
	 * so allow those block groups to still be allowed to have a bitmap
	 * entry.
2131
	 */
2132
	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2133 2134 2135 2136 2137
		return false;

	return true;
}
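
/*
 * Rough numbers for use_bitmap() (illustrative, assuming a 4K sectorsize):
 * extents_thresh starts at (32K / 2) / sizeof(struct btrfs_free_space)
 * entries, and a "small" extent here is anything <= 8 * 4K = 32K.  Small
 * extents are only pushed into a bitmap once the extent entries in use
 * exceed a third of that threshold, which keeps cheap extent entries
 * available for large free regions.
 */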

2138
static const struct btrfs_free_space_op free_space_op = {
J
Josef Bacik 已提交
2139 2140 2141
	.use_bitmap		= use_bitmap,
};

2142 2143 2144 2145
static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	struct btrfs_free_space *bitmap_info;
2146
	struct btrfs_block_group *block_group = NULL;
2147
	int added = 0;
J
Josef Bacik 已提交
2148
	u64 bytes, offset, bytes_added;
2149
	enum btrfs_trim_state trim_state;
2150
	int ret;
2151 2152 2153

	bytes = info->bytes;
	offset = info->offset;
2154
	trim_state = info->trim_state;
2155

2156 2157 2158
	if (!ctl->op->use_bitmap(ctl, info))
		return 0;

J
Josef Bacik 已提交
2159 2160
	if (ctl->op == &free_space_op)
		block_group = ctl->private;
2161
again:
J
Josef Bacik 已提交
2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178
	/*
	 * Since we link bitmaps right into the cluster we need to see if we
	 * have a cluster here, and if so and it has our bitmap we need to add
	 * the free space to that bitmap.
	 */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		struct btrfs_free_cluster *cluster;
		struct rb_node *node;
		struct btrfs_free_space *entry;

		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
		spin_lock(&cluster->lock);
		node = rb_first(&cluster->root);
		if (!node) {
			spin_unlock(&cluster->lock);
2179
			goto no_cluster_bitmap;
J
Josef Bacik 已提交
2180 2181 2182 2183 2184
		}

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!entry->bitmap) {
			spin_unlock(&cluster->lock);
2185
			goto no_cluster_bitmap;
J
Josef Bacik 已提交
2186 2187 2188
		}

		if (entry->offset == offset_to_bitmap(ctl, offset)) {
2189 2190
			bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
							  bytes, trim_state);
J
Josef Bacik 已提交
2191 2192 2193 2194 2195 2196 2197 2198 2199
			bytes -= bytes_added;
			offset += bytes_added;
		}
		spin_unlock(&cluster->lock);
		if (!bytes) {
			ret = 1;
			goto out;
		}
	}
2200 2201

no_cluster_bitmap:
2202
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2203 2204
					 1, 0);
	if (!bitmap_info) {
2205
		ASSERT(added == 0);
2206 2207 2208
		goto new_bitmap;
	}

2209 2210
	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);
J
Josef Bacik 已提交
2211 2212 2213
	bytes -= bytes_added;
	offset += bytes_added;
	added = 0;
2214 2215 2216 2217 2218 2219 2220 2221 2222

	if (!bytes) {
		ret = 1;
		goto out;
	} else
		goto again;

new_bitmap:
	if (info && info->bitmap) {
2223
		add_new_bitmap(ctl, info, offset);
2224 2225 2226 2227
		added = 1;
		info = NULL;
		goto again;
	} else {
2228
		spin_unlock(&ctl->tree_lock);
2229 2230 2231

		/* no pre-allocated info, allocate a new one */
		if (!info) {
2232 2233
			info = kmem_cache_zalloc(btrfs_free_space_cachep,
						 GFP_NOFS);
2234
			if (!info) {
2235
				spin_lock(&ctl->tree_lock);
2236 2237 2238 2239 2240 2241
				ret = -ENOMEM;
				goto out;
			}
		}

		/* allocate the bitmap */
2242 2243
		info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
						 GFP_NOFS);
2244
		info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2245
		spin_lock(&ctl->tree_lock);
2246 2247 2248 2249 2250 2251 2252 2253 2254
		if (!info->bitmap) {
			ret = -ENOMEM;
			goto out;
		}
		goto again;
	}

out:
	if (info) {
2255 2256 2257
		if (info->bitmap)
			kmem_cache_free(btrfs_free_space_bitmap_cachep,
					info->bitmap);
2258
		kmem_cache_free(btrfs_free_space_cachep, info);
2259
	}
J
Josef Bacik 已提交
2260 2261 2262 2263

	return ret;
}

/*
 * Free space merging rules:
 *  1) Merge trimmed areas together
 *  2) Let untrimmed areas coalesce with trimmed areas
 *  3) Always pull neighboring regions from bitmaps
 *
 * The above rules are for when we merge free space based on btrfs_trim_state.
 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
 * same reason: to promote larger extent regions which makes life easier for
 * find_free_extent().  Rule 2 enables coalescing based on the common path
 * being returning free space from btrfs_finish_extent_commit().  So when free
 * space is trimmed, it will prevent aggregating trimmed new region and
 * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
 * and provide find_free_extent() with the largest extents possible hoping for
 * the reuse path.
 */
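
/*
 * Illustrative example of the rules above: freeing an untrimmed 4K block
 * right in front of a trimmed extent entry produces one larger entry
 * (rule 2), but the merged entry stays marked untrimmed so async discard
 * still sees it.  Two trimmed neighbours simply merge (rule 1), and any
 * adjacent free bits are pulled out of neighbouring bitmaps into the new
 * extent entry (rule 3).
 */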
2280
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2281
			  struct btrfs_free_space *info, bool update_stat)
J
Josef Bacik 已提交
2282
{
2283
	struct btrfs_free_space *left_info = NULL;
2284 2285 2286 2287
	struct btrfs_free_space *right_info;
	bool merged = false;
	u64 offset = info->offset;
	u64 bytes = info->bytes;
2288
	const bool is_trimmed = btrfs_free_space_trimmed(info);
2289

J
Josef Bacik 已提交
2290 2291 2292 2293 2294
	/*
	 * first we want to see if there is free space adjacent to the range we
	 * are adding, if there is remove that struct and add a new one to
	 * cover the entire range
	 */
2295
	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2296 2297 2298
	if (right_info && rb_prev(&right_info->offset_index))
		left_info = rb_entry(rb_prev(&right_info->offset_index),
				     struct btrfs_free_space, offset_index);
2299
	else if (!right_info)
2300
		left_info = tree_search_offset(ctl, offset - 1, 0, 0);
J
Josef Bacik 已提交
2301

2302 2303 2304
	/* See try_merge_free_space() comment. */
	if (right_info && !right_info->bitmap &&
	    (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2305
		if (update_stat)
2306
			unlink_free_space(ctl, right_info);
2307
		else
2308
			__unlink_free_space(ctl, right_info);
2309
		info->bytes += right_info->bytes;
2310
		kmem_cache_free(btrfs_free_space_cachep, right_info);
2311
		merged = true;
J
Josef Bacik 已提交
2312 2313
	}

2314
	/* See try_merge_free_space() comment. */
2315
	if (left_info && !left_info->bitmap &&
2316 2317
	    left_info->offset + left_info->bytes == offset &&
	    (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2318
		if (update_stat)
2319
			unlink_free_space(ctl, left_info);
2320
		else
2321
			__unlink_free_space(ctl, left_info);
2322 2323
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
2324
		kmem_cache_free(btrfs_free_space_cachep, left_info);
2325
		merged = true;
J
Josef Bacik 已提交
2326 2327
	}

2328 2329 2330
	return merged;
}

static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     bool update_stat)
{
	struct btrfs_free_space *bitmap;
	unsigned long i;
	unsigned long j;
	const u64 end = info->offset + info->bytes;
	const u64 bitmap_offset = offset_to_bitmap(ctl, end);
	u64 bytes;

	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
	if (!bitmap)
		return false;

	i = offset_to_bit(bitmap->offset, ctl->unit, end);
	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
	if (j == i)
		return false;
	bytes = (j - i) * ctl->unit;
	info->bytes += bytes;

	/* See try_merge_free_space() comment. */
	if (!btrfs_free_space_trimmed(bitmap))
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (update_stat)
		bitmap_clear_bits(ctl, bitmap, end, bytes);
	else
		__bitmap_clear_bits(ctl, bitmap, end, bytes);

	if (!bitmap->bytes)
		free_bitmap(ctl, bitmap);

	return true;
}

static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *info,
				       bool update_stat)
{
	struct btrfs_free_space *bitmap;
	u64 bitmap_offset;
	unsigned long i;
	unsigned long j;
	unsigned long prev_j;
	u64 bytes;

	bitmap_offset = offset_to_bitmap(ctl, info->offset);
	/* If we're on a boundary, try the previous logical bitmap. */
	if (bitmap_offset == info->offset) {
		if (info->offset == 0)
			return false;
		bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
	}

	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
	if (!bitmap)
		return false;

	i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
	j = 0;
	prev_j = (unsigned long)-1;
	for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
		if (j > i)
			break;
		prev_j = j;
	}
	if (prev_j == i)
		return false;

	if (prev_j == (unsigned long)-1)
		bytes = (i + 1) * ctl->unit;
	else
		bytes = (i - prev_j) * ctl->unit;

	info->offset -= bytes;
	info->bytes += bytes;

	/* See try_merge_free_space() comment. */
	if (!btrfs_free_space_trimmed(bitmap))
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (update_stat)
		bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
	else
		__bitmap_clear_bits(ctl, bitmap, info->offset, bytes);

	if (!bitmap->bytes)
		free_bitmap(ctl, bitmap);

	return true;
}

/*
 * We prefer always to allocate from extent entries, both for clustered and
 * non-clustered allocation requests. So when attempting to add a new extent
 * entry, try to see if there's adjacent free space in bitmap entries, and if
 * there is, migrate that space from the bitmaps to the extent.
 * Like this we get better chances of satisfying space allocation requests
 * because we attempt to satisfy them based on a single cache entry, and never
 * on 2 or more entries - even if the entries represent a contiguous free space
 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
 * ends).
 */
static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info,
			      bool update_stat)
{
	/*
	 * Only work with disconnected entries, as we can change their offset,
	 * and must be extent entries.
	 */
	ASSERT(!info->bitmap);
	ASSERT(RB_EMPTY_NODE(&info->offset_index));

	if (ctl->total_bitmaps > 0) {
		bool stole_end;
		bool stole_front = false;

		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
		if (ctl->total_bitmaps > 0)
			stole_front = steal_from_bitmap_to_front(ctl, info,
								 update_stat);

		if (stole_end || stole_front)
			try_merge_free_space(ctl, info, update_stat);
	}
}

2461 2462
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
			   struct btrfs_free_space_ctl *ctl,
2463 2464
			   u64 offset, u64 bytes,
			   enum btrfs_trim_state trim_state)
2465
{
2466
	struct btrfs_block_group *block_group = ctl->private;
2467 2468
	struct btrfs_free_space *info;
	int ret = 0;
D
Dennis Zhou 已提交
2469
	u64 filter_bytes = bytes;
2470

2471
	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2472 2473 2474 2475 2476
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;
2477
	info->trim_state = trim_state;
2478
	RB_CLEAR_NODE(&info->offset_index);
2479

2480
	spin_lock(&ctl->tree_lock);
2481

2482
	if (try_merge_free_space(ctl, info, true))
2483 2484 2485 2486 2487 2488 2489
		goto link;

	/*
	 * There was no extent directly to the left or right of this new
	 * extent then we know we're going to have to allocate a new extent, so
	 * before we do that see if we need to drop this into a bitmap
	 */
2490
	ret = insert_into_bitmap(ctl, info);
2491 2492 2493 2494 2495 2496 2497
	if (ret < 0) {
		goto out;
	} else if (ret) {
		ret = 0;
		goto out;
	}
link:
2498 2499 2500 2501 2502 2503 2504 2505
	/*
	 * Only steal free space from adjacent bitmaps if we're sure we're not
	 * going to add the new free space to existing bitmap entries - because
	 * that would mean unnecessary work that would be reverted. Therefore
	 * attempt to steal space from bitmaps if we're adding an extent entry.
	 */
	steal_from_bitmap(ctl, info, true);

D
Dennis Zhou 已提交
2506 2507
	filter_bytes = max(filter_bytes, info->bytes);

2508
	ret = link_free_space(ctl, info);
J
Josef Bacik 已提交
2509
	if (ret)
2510
		kmem_cache_free(btrfs_free_space_cachep, info);
2511
out:
2512
	btrfs_discard_update_discardable(block_group);
2513
	spin_unlock(&ctl->tree_lock);
2514

J
Josef Bacik 已提交
2515
	if (ret) {
2516
		btrfs_crit(fs_info, "unable to add free space :%d", ret);
2517
		ASSERT(ret != -EEXIST);
J
Josef Bacik 已提交
2518 2519
	}

D
Dennis Zhou 已提交
2520 2521
	if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
		btrfs_discard_check_filter(block_group, filter_bytes);
2522
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
D
Dennis Zhou 已提交
2523
	}
2524

J
Josef Bacik 已提交
2525 2526 2527
	return ret;
}

int btrfs_add_free_space(struct btrfs_block_group *block_group,
			 u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group->fs_info,
				      block_group->free_space_ctl,
				      bytenr, size, trim_state);
}

/*
 * This is a subtle distinction because when adding free space back in general,
 * we want it to be added as untrimmed for async. But in the case where we add
 * it on loading of a block group, we want to consider it trimmed.
 */
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
				       u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
	    btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group->fs_info,
				      block_group->free_space_ctl,
				      bytenr, size, trim_state);
}
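
/*
 * Caller sketch (hypothetical, not from this file): when giving an extent
 * back to the in-memory cache, a caller would typically do something like
 *
 *	ret = btrfs_add_free_space(cache, start, len);
 *	if (ret)
 *		btrfs_err(cache->fs_info, "failed to add free space: %d", ret);
 *
 * where @cache, @start and @len are placeholders.  The _async_trimmed()
 * variant above is for the case where the space should already count as
 * trimmed (e.g. at block group load), so async discard won't re-trim it.
 */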

2560
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2561
			    u64 offset, u64 bytes)
J
Josef Bacik 已提交
2562
{
2563
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
J
Josef Bacik 已提交
2564
	struct btrfs_free_space *info;
2565 2566
	int ret;
	bool re_search = false;
J
Josef Bacik 已提交
2567

2568
	spin_lock(&ctl->tree_lock);
2569

2570
again:
2571
	ret = 0;
2572 2573 2574
	if (!bytes)
		goto out_lock;

2575
	info = tree_search_offset(ctl, offset, 0, 0);
2576
	if (!info) {
2577 2578 2579 2580
		/*
		 * oops didn't find an extent that matched the space we wanted
		 * to remove, look for a bitmap instead
		 */
2581
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2582 2583
					  1, 0);
		if (!info) {
2584 2585 2586 2587
			/*
			 * If we found a partial bit of our free space in a
			 * bitmap but then couldn't find the other part this may
			 * be a problem, so WARN about it.
2588
			 */
2589
			WARN_ON(re_search);
2590 2591
			goto out_lock;
		}
2592 2593
	}

2594
	re_search = false;
2595
	if (!info->bitmap) {
2596
		unlink_free_space(ctl, info);
2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607
		if (offset == info->offset) {
			u64 to_free = min(bytes, info->bytes);

			info->bytes -= to_free;
			info->offset += to_free;
			if (info->bytes) {
				ret = link_free_space(ctl, info);
				WARN_ON(ret);
			} else {
				kmem_cache_free(btrfs_free_space_cachep, info);
			}
J
Josef Bacik 已提交
2608

2609 2610 2611 2612 2613
			offset += to_free;
			bytes -= to_free;
			goto again;
		} else {
			u64 old_end = info->bytes + info->offset;
2614

2615
			info->bytes = offset - info->offset;
2616
			ret = link_free_space(ctl, info);
2617 2618 2619 2620
			WARN_ON(ret);
			if (ret)
				goto out_lock;

2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631
			/* Not enough bytes in this entry to satisfy us */
			if (old_end < offset + bytes) {
				bytes -= old_end - offset;
				offset = old_end;
				goto again;
			} else if (old_end == offset + bytes) {
				/* all done */
				goto out_lock;
			}
			spin_unlock(&ctl->tree_lock);

2632 2633 2634 2635
			ret = __btrfs_add_free_space(block_group->fs_info, ctl,
						     offset + bytes,
						     old_end - (offset + bytes),
						     info->trim_state);
2636 2637 2638
			WARN_ON(ret);
			goto out;
		}
J
Josef Bacik 已提交
2639
	}
2640

2641
	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2642 2643
	if (ret == -EAGAIN) {
		re_search = true;
2644
		goto again;
2645
	}
2646
out_lock:
2647
	btrfs_discard_update_discardable(block_group);
2648
	spin_unlock(&ctl->tree_lock);
J
Josef Bacik 已提交
2649
out:
2650 2651 2652
	return ret;
}

2653
void btrfs_dump_free_space(struct btrfs_block_group *block_group,
J
Josef Bacik 已提交
2654 2655
			   u64 bytes)
{
2656
	struct btrfs_fs_info *fs_info = block_group->fs_info;
2657
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
J
Josef Bacik 已提交
2658 2659 2660 2661
	struct btrfs_free_space *info;
	struct rb_node *n;
	int count = 0;

2662
	spin_lock(&ctl->tree_lock);
2663
	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
J
Josef Bacik 已提交
2664
		info = rb_entry(n, struct btrfs_free_space, offset_index);
L
Liu Bo 已提交
2665
		if (info->bytes >= bytes && !block_group->ro)
J
Josef Bacik 已提交
2666
			count++;
2667
		btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2668
			   info->offset, info->bytes,
2669
		       (info->bitmap) ? "yes" : "no");
J
Josef Bacik 已提交
2670
	}
2671
	spin_unlock(&ctl->tree_lock);
2672
	btrfs_info(fs_info, "block group has cluster?: %s",
2673
	       list_empty(&block_group->cluster_list) ? "no" : "yes");
2674
	btrfs_info(fs_info,
2675
		   "%d blocks of free space at or bigger than bytes is", count);
J
Josef Bacik 已提交
2676 2677
}

2678 2679
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
			       struct btrfs_free_space_ctl *ctl)
J
Josef Bacik 已提交
2680
{
2681
	struct btrfs_fs_info *fs_info = block_group->fs_info;
J
Josef Bacik 已提交
2682

2683
	spin_lock_init(&ctl->tree_lock);
2684
	ctl->unit = fs_info->sectorsize;
2685
	ctl->start = block_group->start;
2686 2687
	ctl->private = block_group;
	ctl->op = &free_space_op;
2688 2689
	INIT_LIST_HEAD(&ctl->trimming_ranges);
	mutex_init(&ctl->cache_writeout_mutex);
J
Josef Bacik 已提交
2690

2691 2692 2693 2694 2695
	/*
	 * we only want to have 32k of ram per block group for keeping
	 * track of free space, and if we pass 1/2 of that we want to
	 * start converting things over to using bitmaps
	 */
2696
	ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
J
Josef Bacik 已提交
2697 2698
}

/*
 * for a given cluster, put all of its extents back into the free
 * space cache.  If the block group passed doesn't match the block group
 * pointed to by the cluster, someone else raced in and freed the
 * cluster already.  In that case, we just return without changing anything
 */
2705
static void __btrfs_return_cluster_to_free_space(
2706
			     struct btrfs_block_group *block_group,
2707 2708
			     struct btrfs_free_cluster *cluster)
{
2709
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2710 2711 2712 2713 2714 2715 2716
	struct btrfs_free_space *entry;
	struct rb_node *node;

	spin_lock(&cluster->lock);
	if (cluster->block_group != block_group)
		goto out;

2717
	cluster->block_group = NULL;
2718
	cluster->window_start = 0;
2719 2720
	list_del_init(&cluster->block_group_list);

2721
	node = rb_first(&cluster->root);
2722
	while (node) {
2723 2724
		bool bitmap;

2725 2726 2727
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		rb_erase(&entry->offset_index, &cluster->root);
2728
		RB_CLEAR_NODE(&entry->offset_index);
2729 2730

		bitmap = (entry->bitmap != NULL);
2731
		if (!bitmap) {
2732
			/* Merging treats extents as if they were new */
2733
			if (!btrfs_free_space_trimmed(entry)) {
2734
				ctl->discardable_extents[BTRFS_STAT_CURR]--;
2735 2736 2737
				ctl->discardable_bytes[BTRFS_STAT_CURR] -=
					entry->bytes;
			}
2738

2739
			try_merge_free_space(ctl, entry, false);
2740
			steal_from_bitmap(ctl, entry, false);
2741 2742

			/* As we insert directly, update these statistics */
2743
			if (!btrfs_free_space_trimmed(entry)) {
2744
				ctl->discardable_extents[BTRFS_STAT_CURR]++;
2745 2746 2747
				ctl->discardable_bytes[BTRFS_STAT_CURR] +=
					entry->bytes;
			}
2748
		}
2749
		tree_insert_offset(&ctl->free_space_offset,
2750
				   entry->offset, &entry->offset_index, bitmap);
2751
	}
2752
	cluster->root = RB_ROOT;
2753

2754 2755
out:
	spin_unlock(&cluster->lock);
2756
	btrfs_put_block_group(block_group);
2757 2758
}

2759 2760
static void __btrfs_remove_free_space_cache_locked(
				struct btrfs_free_space_ctl *ctl)
J
Josef Bacik 已提交
2761 2762 2763
{
	struct btrfs_free_space *info;
	struct rb_node *node;
2764 2765 2766

	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
2767 2768 2769 2770 2771 2772
		if (!info->bitmap) {
			unlink_free_space(ctl, info);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(ctl, info);
		}
2773 2774

		cond_resched_lock(&ctl->tree_lock);
2775
	}
2776 2777 2778 2779 2780 2781
}

void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
	spin_lock(&ctl->tree_lock);
	__btrfs_remove_free_space_cache_locked(ctl);
2782
	if (ctl->private)
2783
		btrfs_discard_update_discardable(ctl->private);
2784 2785 2786
	spin_unlock(&ctl->tree_lock);
}

2787
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
2788 2789
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2790
	struct btrfs_free_cluster *cluster;
2791
	struct list_head *head;
J
Josef Bacik 已提交
2792

2793
	spin_lock(&ctl->tree_lock);
2794 2795 2796 2797
	while ((head = block_group->cluster_list.next) !=
	       &block_group->cluster_list) {
		cluster = list_entry(head, struct btrfs_free_cluster,
				     block_group_list);
2798 2799 2800

		WARN_ON(cluster->block_group != block_group);
		__btrfs_return_cluster_to_free_space(block_group, cluster);
2801 2802

		cond_resched_lock(&ctl->tree_lock);
2803
	}
2804
	__btrfs_remove_free_space_cache_locked(ctl);
2805
	btrfs_discard_update_discardable(block_group);
2806
	spin_unlock(&ctl->tree_lock);
2807

J
Josef Bacik 已提交
2808 2809
}

/**
 * btrfs_is_free_space_trimmed - see if everything is trimmed
 * @block_group: block_group of interest
 *
 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
 */
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	struct rb_node *node;
	bool ret = true;

	spin_lock(&ctl->tree_lock);
	node = rb_first(&ctl->free_space_offset);

	while (node) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);

		if (!btrfs_free_space_trimmed(info)) {
			ret = false;
			break;
		}

		node = rb_next(node);
	}

	spin_unlock(&ctl->tree_lock);
	return ret;
}
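
/*
 * Illustrative use (hypothetical caller): discard housekeeping can poll this
 * to decide whether a block group has anything left to trim, e.g.
 *
 *	if (btrfs_is_free_space_trimmed(block_group))
 *		return;
 *
 * Note the walk visits every free space entry under ctl->tree_lock, so it is
 * intended for slow paths rather than per-allocation checks.
 */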

2841
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
2842 2843
			       u64 offset, u64 bytes, u64 empty_size,
			       u64 *max_extent_size)
J
Josef Bacik 已提交
2844
{
2845
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2846 2847
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
2848
	struct btrfs_free_space *entry = NULL;
2849
	u64 bytes_search = bytes + empty_size;
2850
	u64 ret = 0;
D
David Woodhouse 已提交
2851 2852
	u64 align_gap = 0;
	u64 align_gap_len = 0;
2853
	enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
J
Josef Bacik 已提交
2854

2855
	spin_lock(&ctl->tree_lock);
D
David Woodhouse 已提交
2856
	entry = find_free_space(ctl, &offset, &bytes_search,
2857
				block_group->full_stripe_len, max_extent_size);
2858
	if (!entry)
2859 2860 2861 2862
		goto out;

	ret = offset;
	if (entry->bitmap) {
2863
		bitmap_clear_bits(ctl, entry, offset, bytes);
2864 2865 2866 2867

		if (!btrfs_free_space_trimmed(entry))
			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);

2868
		if (!entry->bytes)
2869
			free_bitmap(ctl, entry);
2870
	} else {
2871
		unlink_free_space(ctl, entry);
D
David Woodhouse 已提交
2872 2873
		align_gap_len = offset - entry->offset;
		align_gap = entry->offset;
2874
		align_gap_trim_state = entry->trim_state;
D
David Woodhouse 已提交
2875

2876 2877 2878
		if (!btrfs_free_space_trimmed(entry))
			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);

D
David Woodhouse 已提交
2879 2880 2881 2882
		entry->offset = offset + bytes;
		WARN_ON(entry->bytes < bytes + align_gap_len);

		entry->bytes -= bytes + align_gap_len;
2883
		if (!entry->bytes)
2884
			kmem_cache_free(btrfs_free_space_cachep, entry);
2885
		else
2886
			link_free_space(ctl, entry);
2887
	}
2888
out:
2889
	btrfs_discard_update_discardable(block_group);
2890
	spin_unlock(&ctl->tree_lock);
J
Josef Bacik 已提交
2891

D
David Woodhouse 已提交
2892
	if (align_gap_len)
2893
		__btrfs_add_free_space(block_group->fs_info, ctl,
2894 2895
				       align_gap, align_gap_len,
				       align_gap_trim_state);
J
Josef Bacik 已提交
2896 2897
	return ret;
}

/*
 * given a cluster, put all of its extents back into the free space
 * cache.  If a block group is passed, this function will only free
 * a cluster that belongs to the passed block group.
 *
 * Otherwise, it'll get a reference on the block group pointed to by the
 * cluster and remove the cluster from it.
 */
2907
void btrfs_return_cluster_to_free_space(
2908
			       struct btrfs_block_group *block_group,
2909 2910
			       struct btrfs_free_cluster *cluster)
{
2911
	struct btrfs_free_space_ctl *ctl;
2912 2913 2914 2915 2916 2917 2918

	/* first, get a safe pointer to the block group */
	spin_lock(&cluster->lock);
	if (!block_group) {
		block_group = cluster->block_group;
		if (!block_group) {
			spin_unlock(&cluster->lock);
2919
			return;
2920 2921 2922 2923
		}
	} else if (cluster->block_group != block_group) {
		/* someone else has already freed it don't redo their work */
		spin_unlock(&cluster->lock);
2924
		return;
2925
	}
2926
	btrfs_get_block_group(block_group);
2927 2928
	spin_unlock(&cluster->lock);

2929 2930
	ctl = block_group->free_space_ctl;

2931
	/* now return any extents the cluster had on it */
2932
	spin_lock(&ctl->tree_lock);
2933
	__btrfs_return_cluster_to_free_space(block_group, cluster);
2934
	spin_unlock(&ctl->tree_lock);
2935

2936 2937
	btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);

2938 2939 2940 2941
	/* finally drop our ref */
	btrfs_put_block_group(block_group);
}

2942
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
2943
				   struct btrfs_free_cluster *cluster,
2944
				   struct btrfs_free_space *entry,
2945 2946
				   u64 bytes, u64 min_start,
				   u64 *max_extent_size)
2947
{
2948
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2949 2950 2951 2952 2953 2954 2955 2956
	int err;
	u64 search_start = cluster->window_start;
	u64 search_bytes = bytes;
	u64 ret = 0;

	search_start = min_start;
	search_bytes = bytes;

2957
	err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2958
	if (err) {
J
Josef Bacik 已提交
2959 2960
		*max_extent_size = max(get_max_extent_size(entry),
				       *max_extent_size);
2961
		return 0;
2962
	}
2963 2964

	ret = search_start;
2965
	__bitmap_clear_bits(ctl, entry, ret, bytes);
2966 2967 2968 2969

	return ret;
}

/*
 * given a cluster, try to allocate 'bytes' from it, returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
 * if things worked out
 */
2975
u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
2976
			     struct btrfs_free_cluster *cluster, u64 bytes,
2977
			     u64 min_start, u64 *max_extent_size)
2978
{
2979
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2980 2981
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;
	u64 ret = 0;

	spin_lock(&cluster->lock);
	if (bytes > cluster->max_size)
		goto out;

	if (cluster->block_group != block_group)
		goto out;

	node = rb_first(&cluster->root);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_free_space, offset_index);
2998
	while (1) {
J
Josef Bacik 已提交
2999 3000 3001
		if (entry->bytes < bytes)
			*max_extent_size = max(get_max_extent_size(entry),
					       *max_extent_size);
3002

3003 3004
		if (entry->bytes < bytes ||
		    (!entry->bitmap && entry->offset < min_start)) {
3005 3006 3007 3008 3009 3010 3011 3012
			node = rb_next(&entry->offset_index);
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
			continue;
		}

3013 3014 3015
		if (entry->bitmap) {
			ret = btrfs_alloc_from_bitmap(block_group,
						      cluster, entry, bytes,
3016 3017
						      cluster->window_start,
						      max_extent_size);
3018 3019 3020 3021 3022 3023 3024 3025
			if (ret == 0) {
				node = rb_next(&entry->offset_index);
				if (!node)
					break;
				entry = rb_entry(node, struct btrfs_free_space,
						 offset_index);
				continue;
			}
3026
			cluster->window_start += bytes;
3027 3028 3029 3030 3031 3032
		} else {
			ret = entry->offset;

			entry->offset += bytes;
			entry->bytes -= bytes;
		}
3033

3034
		if (entry->bytes == 0)
3035 3036 3037 3038 3039
			rb_erase(&entry->offset_index, &cluster->root);
		break;
	}
out:
	spin_unlock(&cluster->lock);
3040

3041 3042 3043
	if (!ret)
		return 0;

3044
	spin_lock(&ctl->tree_lock);
3045

3046 3047 3048
	if (!btrfs_free_space_trimmed(entry))
		atomic64_add(bytes, &discard_ctl->discard_bytes_saved);

3049
	ctl->free_space -= bytes;
3050 3051
	if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3052
	if (entry->bytes == 0) {
3053
		ctl->free_extents--;
3054
		if (entry->bitmap) {
3055 3056
			kmem_cache_free(btrfs_free_space_bitmap_cachep,
					entry->bitmap);
3057
			ctl->total_bitmaps--;
3058
			recalculate_thresholds(ctl);
3059 3060
		} else if (!btrfs_free_space_trimmed(entry)) {
			ctl->discardable_extents[BTRFS_STAT_CURR]--;
3061
		}
3062
		kmem_cache_free(btrfs_free_space_cachep, entry);
3063 3064
	}

3065
	spin_unlock(&ctl->tree_lock);
3066

3067 3068 3069
	return ret;
}

3070
static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3071 3072
				struct btrfs_free_space *entry,
				struct btrfs_free_cluster *cluster,
3073 3074
				u64 offset, u64 bytes,
				u64 cont1_bytes, u64 min_bytes)
3075
{
3076
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3077 3078
	unsigned long next_zero;
	unsigned long i;
3079 3080
	unsigned long want_bits;
	unsigned long min_bits;
3081
	unsigned long found_bits;
3082
	unsigned long max_bits = 0;
3083 3084
	unsigned long start = 0;
	unsigned long total_found = 0;
3085
	int ret;
3086

3087
	i = offset_to_bit(entry->offset, ctl->unit,
3088
			  max_t(u64, offset, entry->offset));
3089 3090
	want_bits = bytes_to_bits(bytes, ctl->unit);
	min_bits = bytes_to_bits(min_bytes, ctl->unit);
3091

3092 3093 3094 3095 3096 3097 3098
	/*
	 * Don't bother looking for a cluster in this bitmap if it's heavily
	 * fragmented.
	 */
	if (entry->max_extent_size &&
	    entry->max_extent_size < cont1_bytes)
		return -ENOSPC;
3099 3100
again:
	found_bits = 0;
3101
	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3102 3103
		next_zero = find_next_zero_bit(entry->bitmap,
					       BITS_PER_BITMAP, i);
3104
		if (next_zero - i >= min_bits) {
3105
			found_bits = next_zero - i;
3106 3107
			if (found_bits > max_bits)
				max_bits = found_bits;
3108 3109
			break;
		}
3110 3111
		if (next_zero - i > max_bits)
			max_bits = next_zero - i;
3112 3113 3114
		i = next_zero;
	}

3115 3116
	if (!found_bits) {
		entry->max_extent_size = (u64)max_bits * ctl->unit;
3117
		return -ENOSPC;
3118
	}
3119

3120
	if (!total_found) {
3121
		start = i;
3122
		cluster->max_size = 0;
3123 3124 3125 3126
	}

	total_found += found_bits;

3127 3128
	if (cluster->max_size < found_bits * ctl->unit)
		cluster->max_size = found_bits * ctl->unit;
3129

3130 3131
	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
		i = next_zero + 1;
3132 3133 3134
		goto again;
	}

3135
	cluster->window_start = start * ctl->unit + entry->offset;
3136
	rb_erase(&entry->offset_index, &ctl->free_space_offset);
3137 3138
	ret = tree_insert_offset(&cluster->root, entry->offset,
				 &entry->offset_index, 1);
3139
	ASSERT(!ret); /* -EEXIST; Logic error */
3140

J
Josef Bacik 已提交
3141
	trace_btrfs_setup_cluster(block_group, cluster,
3142
				  total_found * ctl->unit, 1);
3143 3144 3145
	return 0;
}

3146 3147
/*
 * This searches the block group for just extents to fill the cluster with.
3148 3149
 * Try to find a cluster with at least bytes total bytes, at least one
 * extent of cont1_bytes, and other clusters of at least min_bytes.
3150
 */
3151
static noinline int
3152
setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3153 3154
			struct btrfs_free_cluster *cluster,
			struct list_head *bitmaps, u64 offset, u64 bytes,
3155
			u64 cont1_bytes, u64 min_bytes)
3156
{
3157
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3158 3159 3160 3161 3162 3163
	struct btrfs_free_space *first = NULL;
	struct btrfs_free_space *entry = NULL;
	struct btrfs_free_space *last;
	struct rb_node *node;
	u64 window_free;
	u64 max_extent;
J
Josef Bacik 已提交
3164
	u64 total_size = 0;
3165

3166
	entry = tree_search_offset(ctl, offset, 0, 1);
3167 3168 3169 3170 3171 3172 3173
	if (!entry)
		return -ENOSPC;

	/*
	 * We don't want bitmaps, so just move along until we find a normal
	 * extent entry.
	 */
3174 3175
	while (entry->bitmap || entry->bytes < min_bytes) {
		if (entry->bitmap && list_empty(&entry->list))
3176
			list_add_tail(&entry->list, bitmaps);
3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187
		node = rb_next(&entry->offset_index);
		if (!node)
			return -ENOSPC;
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
	}

	window_free = entry->bytes;
	max_extent = entry->bytes;
	first = entry;
	last = entry;

3188 3189
	for (node = rb_next(&entry->offset_index); node;
	     node = rb_next(&entry->offset_index)) {
3190 3191
		entry = rb_entry(node, struct btrfs_free_space, offset_index);

3192 3193 3194
		if (entry->bitmap) {
			if (list_empty(&entry->list))
				list_add_tail(&entry->list, bitmaps);
3195
			continue;
3196 3197
		}

3198 3199 3200 3201 3202 3203
		if (entry->bytes < min_bytes)
			continue;

		last = entry;
		window_free += entry->bytes;
		if (entry->bytes > max_extent)
3204 3205 3206
			max_extent = entry->bytes;
	}

3207 3208 3209
	if (window_free < bytes || max_extent < cont1_bytes)
		return -ENOSPC;

3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222
	cluster->window_start = first->offset;

	node = &first->offset_index;

	/*
	 * now we've found our entries, pull them out of the free space
	 * cache and put them into the cluster rbtree
	 */
	do {
		int ret;

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
3223
		if (entry->bitmap || entry->bytes < min_bytes)
3224 3225
			continue;

3226
		rb_erase(&entry->offset_index, &ctl->free_space_offset);
3227 3228
		ret = tree_insert_offset(&cluster->root, entry->offset,
					 &entry->offset_index, 0);
J
Josef Bacik 已提交
3229
		total_size += entry->bytes;
3230
		ASSERT(!ret); /* -EEXIST; Logic error */
3231 3232 3233
	} while (node && entry != last);

	cluster->max_size = max_extent;
J
Josef Bacik 已提交
3234
	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3235 3236 3237 3238 3239 3240 3241
	return 0;
}

/*
 * This specifically looks for bitmaps that may work in the cluster, we assume
 * that we have already failed to find extents that will work.
 */
3242
static noinline int
3243
setup_cluster_bitmap(struct btrfs_block_group *block_group,
3244 3245
		     struct btrfs_free_cluster *cluster,
		     struct list_head *bitmaps, u64 offset, u64 bytes,
3246
		     u64 cont1_bytes, u64 min_bytes)
3247
{
3248
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3249
	struct btrfs_free_space *entry = NULL;
3250
	int ret = -ENOSPC;
3251
	u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3252

3253
	if (ctl->total_bitmaps == 0)
3254 3255
		return -ENOSPC;

3256 3257 3258 3259
	/*
	 * The bitmap that covers offset won't be in the list unless offset
	 * is just its start offset.
	 */
3260 3261 3262 3263
	if (!list_empty(bitmaps))
		entry = list_first_entry(bitmaps, struct btrfs_free_space, list);

	if (!entry || entry->offset != bitmap_offset) {
3264 3265 3266 3267 3268
		entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
		if (entry && list_empty(&entry->list))
			list_add(&entry->list, bitmaps);
	}

3269
	list_for_each_entry(entry, bitmaps, list) {
3270
		if (entry->bytes < bytes)
3271 3272
			continue;
		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3273
					   bytes, cont1_bytes, min_bytes);
3274 3275 3276 3277 3278
		if (!ret)
			return 0;
	}

	/*
3279 3280
	 * The bitmaps list has all the bitmaps that record free space
	 * starting after offset, so no more search is required.
3281
	 */
3282
	return -ENOSPC;
3283 3284
}

3285 3286
/*
 * here we try to find a cluster of blocks in a block group.  The goal
3287
 * is to find at least bytes+empty_size.
3288 3289 3290 3291 3292
 * We might not find them all in one contiguous area.
 *
 * returns zero and sets up cluster if things worked out, otherwise
 * it returns -enospc
 */
3293
int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3294 3295 3296
			     struct btrfs_free_cluster *cluster,
			     u64 offset, u64 bytes, u64 empty_size)
{
3297
	struct btrfs_fs_info *fs_info = block_group->fs_info;
3298
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3299
	struct btrfs_free_space *entry, *tmp;
3300
	LIST_HEAD(bitmaps);
3301
	u64 min_bytes;
3302
	u64 cont1_bytes;
3303 3304
	int ret;

3305 3306 3307 3308 3309 3310
	/*
	 * Choose the minimum extent size we'll require for this
	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
	 * For metadata, allow allocates with smaller extents.  For
	 * data, keep it dense.
	 */
3311
	if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3312
		cont1_bytes = min_bytes = bytes + empty_size;
3313
	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3314
		cont1_bytes = bytes;
3315
		min_bytes = fs_info->sectorsize;
3316 3317
	} else {
		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3318
		min_bytes = fs_info->sectorsize;
3319
	}
3320

3321
	spin_lock(&ctl->tree_lock);
3322 3323 3324 3325 3326

	/*
	 * If we know we don't have enough space to make a cluster don't even
	 * bother doing all the work to try and find one.
	 */
3327
	if (ctl->free_space < bytes) {
3328
		spin_unlock(&ctl->tree_lock);
3329 3330 3331
		return -ENOSPC;
	}

3332 3333 3334 3335 3336 3337 3338 3339
	spin_lock(&cluster->lock);

	/* someone already found a cluster, hooray */
	if (cluster->block_group) {
		ret = 0;
		goto out;
	}

J
Josef Bacik 已提交
3340 3341 3342
	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
				 min_bytes);

3343
	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3344 3345
				      bytes + empty_size,
				      cont1_bytes, min_bytes);
3346
	if (ret)
3347
		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3348 3349
					   offset, bytes + empty_size,
					   cont1_bytes, min_bytes);
3350 3351 3352 3353

	/* Clear our temporary list */
	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
		list_del_init(&entry->list);
3354

3355
	if (!ret) {
3356
		btrfs_get_block_group(block_group);
3357 3358 3359
		list_add_tail(&cluster->block_group_list,
			      &block_group->cluster_list);
		cluster->block_group = block_group;
J
Josef Bacik 已提交
3360 3361
	} else {
		trace_btrfs_failed_cluster_setup(block_group);
3362 3363 3364
	}
out:
	spin_unlock(&cluster->lock);
3365
	spin_unlock(&ctl->tree_lock);
3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376

	return ret;
}

/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
3377
	cluster->root = RB_ROOT;
3378
	cluster->max_size = 0;
3379
	cluster->fragmented = false;
3380 3381 3382 3383
	INIT_LIST_HEAD(&cluster->block_group_list);
	cluster->block_group = NULL;
}

3384
static int do_trimming(struct btrfs_block_group *block_group,
3385
		       u64 *total_trimmed, u64 start, u64 bytes,
3386
		       u64 reserved_start, u64 reserved_bytes,
3387
		       enum btrfs_trim_state reserved_trim_state,
3388
		       struct btrfs_trim_range *trim_entry)
3389
{
3390
	struct btrfs_space_info *space_info = block_group->space_info;
3391
	struct btrfs_fs_info *fs_info = block_group->fs_info;
3392
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3393 3394
	int ret;
	int update = 0;
3395 3396 3397
	const u64 end = start + bytes;
	const u64 reserved_end = reserved_start + reserved_bytes;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3398
	u64 trimmed = 0;
3399

3400 3401 3402 3403 3404 3405 3406 3407 3408 3409
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	if (!block_group->ro) {
		block_group->reserved += reserved_bytes;
		space_info->bytes_reserved += reserved_bytes;
		update = 1;
	}
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

3410
	ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3411
	if (!ret) {
3412
		*total_trimmed += trimmed;
3413 3414
		trim_state = BTRFS_TRIM_STATE_TRIMMED;
	}
3415

3416
	mutex_lock(&ctl->cache_writeout_mutex);
3417 3418 3419 3420 3421 3422 3423 3424
	if (reserved_start < start)
		__btrfs_add_free_space(fs_info, ctl, reserved_start,
				       start - reserved_start,
				       reserved_trim_state);
	if (start + bytes < reserved_start + reserved_bytes)
		__btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
				       reserved_trim_state);
	__btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
3425 3426
	list_del(&trim_entry->list);
	mutex_unlock(&ctl->cache_writeout_mutex);
3427 3428 3429 3430 3431 3432 3433 3434 3435

	if (update) {
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);
		if (block_group->ro)
			space_info->bytes_readonly += reserved_bytes;
		block_group->reserved -= reserved_bytes;
		space_info->bytes_reserved -= reserved_bytes;
		spin_unlock(&block_group->lock);
3436
		spin_unlock(&space_info->lock);
3437 3438 3439 3440 3441
	}

	return ret;
}

/*
 * If @async is set, then we will trim 1 region and return.
 */
3445
static int trim_no_bitmap(struct btrfs_block_group *block_group,
3446 3447
			  u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			  bool async)
3448
{
3449 3450
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
3451 3452 3453 3454 3455 3456
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	struct rb_node *node;
	int ret = 0;
	u64 extent_start;
	u64 extent_bytes;
3457
	enum btrfs_trim_state extent_trim_state;
3458
	u64 bytes;
3459
	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3460 3461

	while (start < end) {
3462 3463 3464
		struct btrfs_trim_range trim_entry;

		mutex_lock(&ctl->cache_writeout_mutex);
3465
		spin_lock(&ctl->tree_lock);
3466

3467 3468
		if (ctl->free_space < minlen)
			goto out_unlock;
3469

3470
		entry = tree_search_offset(ctl, start, 0, 1);
3471 3472
		if (!entry)
			goto out_unlock;
3473

3474 3475 3476
		/* Skip bitmaps and if async, already trimmed entries */
		while (entry->bitmap ||
		       (async && btrfs_free_space_trimmed(entry))) {
3477
			node = rb_next(&entry->offset_index);
3478 3479
			if (!node)
				goto out_unlock;
3480 3481
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
3482 3483
		}

3484 3485
		if (entry->offset >= end)
			goto out_unlock;
3486

3487 3488
		extent_start = entry->offset;
		extent_bytes = entry->bytes;
3489
		extent_trim_state = entry->trim_state;
3490 3491 3492 3493 3494 3495 3496 3497 3498
		if (async) {
			start = entry->offset;
			bytes = entry->bytes;
			if (bytes < minlen) {
				spin_unlock(&ctl->tree_lock);
				mutex_unlock(&ctl->cache_writeout_mutex);
				goto next;
			}
			unlink_free_space(ctl, entry);
D
Dennis Zhou 已提交
3499 3500 3501 3502 3503 3504 3505 3506
			/*
			 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
			 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
			 * X when we come back around.  So trim it now.
			 */
			if (max_discard_size &&
			    bytes >= (max_discard_size +
				      BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3507 3508 3509 3510
				bytes = max_discard_size;
				extent_bytes = max_discard_size;
				entry->offset += max_discard_size;
				entry->bytes -= max_discard_size;
3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522
				link_free_space(ctl, entry);
			} else {
				kmem_cache_free(btrfs_free_space_cachep, entry);
			}
		} else {
			start = max(start, extent_start);
			bytes = min(extent_start + extent_bytes, end) - start;
			if (bytes < minlen) {
				spin_unlock(&ctl->tree_lock);
				mutex_unlock(&ctl->cache_writeout_mutex);
				goto next;
			}
3523

3524 3525 3526
			unlink_free_space(ctl, entry);
			kmem_cache_free(btrfs_free_space_cachep, entry);
		}
3527

3528
		spin_unlock(&ctl->tree_lock);
3529 3530 3531 3532
		trim_entry.start = extent_start;
		trim_entry.bytes = extent_bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);
3533

3534
		ret = do_trimming(block_group, total_trimmed, start, bytes,
3535 3536
				  extent_start, extent_bytes, extent_trim_state,
				  &trim_entry);
3537 3538
		if (ret) {
			block_group->discard_cursor = start + bytes;
3539
			break;
3540
		}
3541 3542
next:
		start += bytes;
3543 3544 3545
		block_group->discard_cursor = start;
		if (async && *total_trimmed)
			break;
3546

3547 3548 3549 3550 3551 3552 3553
		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}
3554 3555 3556 3557 3558 3559 3560 3561

	return ret;

out_unlock:
	block_group->discard_cursor = btrfs_block_group_end(block_group);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

	return ret;
}
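
/*
 * Illustrative sketch (not part of the original btrfs code): the arithmetic
 * the async extent path above applies when a max_discard_size is configured.
 * One pass discards at most max_discard_size, but if the remainder would be
 * smaller than BTRFS_ASYNC_DISCARD_MIN_FILTER it would never be picked up on
 * a later pass, so the whole extent is discarded now.  The helper name is
 * invented for illustration.
 */
static inline u64 async_extent_trim_len_sketch(u64 bytes, u64 max_discard_size)
{
	/* No cap configured: discard the whole extent in one go. */
	if (!max_discard_size)
		return bytes;

	/* The remainder stays above the filter, so it can wait until later. */
	if (bytes >= max_discard_size + BTRFS_ASYNC_DISCARD_MIN_FILTER)
		return max_discard_size;

	/* The remainder would fall below the filter: trim everything now. */
	return bytes;
}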

/*
 * If we break out of trimming a bitmap prematurely, we should reset the
 * trimming bit.  In a rather contrived case, it's possible to race here so
 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
 *
 * start = start of bitmap
 * end = near end of bitmap
 *
 * Thread 1:			Thread 2:
 * trim_bitmaps(start)
 *				trim_bitmaps(end)
 *				end_trimming_bitmap()
 * reset_trimming_bitmap()
 */
static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
{
	struct btrfs_free_space *entry;

	spin_lock(&ctl->tree_lock);
	entry = tree_search_offset(ctl, offset, 1, 0);
	if (entry) {
		if (btrfs_free_space_trimmed(entry)) {
			ctl->discardable_extents[BTRFS_STAT_CURR] +=
				entry->bitmap_extents;
			ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
		}
		entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
	}

	spin_unlock(&ctl->tree_lock);
}

static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
				struct btrfs_free_space *entry)
{
	if (btrfs_free_space_trimming_bitmap(entry)) {
		entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			entry->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
	}
}

/*
 * If @async is set, then we will trim 1 region and return.
 */
static int trim_bitmaps(struct btrfs_block_group *block_group,
			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			u64 maxlen, bool async)
{
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	int ret = 0;
	int ret2;
	u64 bytes;
	u64 offset = offset_to_bitmap(ctl, start);
	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

	while (offset < end) {
		bool next_bitmap = false;
		struct btrfs_trim_range trim_entry;

		mutex_lock(&ctl->cache_writeout_mutex);
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen) {
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			break;
		}

		entry = tree_search_offset(ctl, offset, 1, 0);
		/*
		 * Bitmaps are marked trimmed lossily now to prevent constant
		 * discarding of the same bitmap (the reason why we are bound
		 * by the filters).  So, retrim the block group bitmaps when we
		 * are preparing to punt to the unused_bgs list.  This uses
		 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
		 * which is the only discard index which sets minlen to 0.
		 */
		if (!entry || (async && minlen && start == offset &&
			       btrfs_free_space_trimmed(entry))) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * Async discard bitmap trimming begins by setting the start
		 * to key.objectid, and offset_to_bitmap() aligns to the
		 * start of the bitmap.  This lets us know we are fully
		 * scanning the bitmap rather than only some portion of it.
		 */
		if (start == offset)
			entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;

		bytes = minlen;
		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
		if (ret2 || start >= end) {
			/*
			 * We lossily consider a bitmap trimmed if we only skip
			 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
			 */
			if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
				end_trimming_bitmap(ctl, entry);
			else
				entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * We already trimmed a region, but are using the locking above
		 * to reset the trim_state.
		 */
		if (async && *total_trimmed) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto out;
		}

		bytes = min(bytes, end - start);
		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto next;
		}

		/*
		 * Let bytes = max_discard_size + X.
		 * If X < @minlen, we won't trim X when we come back around.
		 * So trim it now.  We differ here from trimming extents as we
		 * don't keep individual state per bit.
		 */
		if (async &&
		    max_discard_size &&
		    bytes > (max_discard_size + minlen))
			bytes = max_discard_size;

		bitmap_clear_bits(ctl, entry, start, bytes);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = start;
		trim_entry.bytes = bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  start, bytes, 0, &trim_entry);
		if (ret) {
			reset_trimming_bitmap(ctl, offset);
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			break;
		}
next:
		if (next_bitmap) {
			offset += BITS_PER_BITMAP * ctl->unit;
			start = offset;
		} else {
			start += bytes;
		}
		block_group->discard_cursor = start;

		if (fatal_signal_pending(current)) {
			if (start != offset)
				reset_trimming_bitmap(ctl, offset);
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	if (offset >= end)
		block_group->discard_cursor = end;

out:
	return ret;
}
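
/*
 * Illustrative sketch (not part of the original btrfs code): trim_bitmaps()
 * above walks the free space in windows of BITS_PER_BITMAP * ctl->unit bytes.
 * Assuming a space that starts at offset 0 (the real offset_to_bitmap() works
 * relative to ctl->start), this shows how a start offset maps to the beginning
 * of its covering bitmap window.  The helper name is invented for illustration.
 */
static inline u64 bitmap_window_start_sketch(u64 start, u64 unit)
{
	u64 bitmap_bytes = BITS_PER_BITMAP * unit;

	/*
	 * Align down to a bitmap boundary.  The next window begins one
	 * bitmap_bytes further on, which is how the "next_bitmap" step
	 * above advances.
	 */
	return div64_u64(start, bitmap_bytes) * bitmap_bytes;
}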

int btrfs_trim_block_group(struct btrfs_block_group *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int ret;
	u64 rem = 0;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
	if (ret)
		goto out;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
	/* If we ended in the middle of a bitmap, reset the trimming flag */
	if (rem)
		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
	btrfs_unfreeze_block_group(block_group);
	return ret;
}
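
/*
 * Illustrative sketch (not part of the original btrfs code): a minimal caller
 * of btrfs_trim_block_group() that trims one whole block group synchronously.
 * The helper name is invented; real callers (for example the fstrim ioctl
 * path) iterate over block groups and honour the range and minlen requested
 * from user space.
 */
static inline int trim_whole_block_group_sketch(struct btrfs_block_group *bg,
						u64 minlen, u64 *trimmed)
{
	return btrfs_trim_block_group(bg, trimmed, bg->start,
				      btrfs_block_group_end(bg), minlen);
}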

int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
	btrfs_unfreeze_block_group(block_group);

	return ret;
}

int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   u64 maxlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
			   async);

	btrfs_unfreeze_block_group(block_group);

	return ret;
}
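
/*
 * Illustrative sketch (not part of the original btrfs code): one way an async
 * caller could drive the two entry points above.  Each call trims at most one
 * region and records its progress in block_group->discard_cursor, so a worker
 * can resume from the cursor on its next pass.  The helper name and the
 * "extents first, then bitmaps" ordering are assumptions for illustration;
 * the real discard worker tracks which phase a block group is in.
 */
static inline int async_discard_pass_sketch(struct btrfs_block_group *bg,
					    u64 minlen, u64 maxlen,
					    u64 *trimmed)
{
	u64 start = bg->discard_cursor;
	u64 end = btrfs_block_group_end(bg);
	int ret;

	/* Try to trim a single untrimmed extent entry. */
	ret = btrfs_trim_block_group_extents(bg, trimmed, start, end, minlen,
					     true);
	if (ret || *trimmed)
		return ret;

	/* Nothing was trimmed: fall back to trimming a bitmap region. */
	return btrfs_trim_block_group_bitmaps(bg, trimmed, start, end, minlen,
					      maxlen, true);
}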

bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}

static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Use this if you need to make a bitmap or extent entry specifically.  It
 * doesn't do any of the merging that add_free_space does; it acts a lot like
 * the free space cache loading code does, so you can get really weird
 * configurations.
 */
int test_add_free_space_entry(struct btrfs_block_group *cache,
			      u64 offset, u64 bytes, bool bitmap)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info = NULL, *bitmap_info;
	void *map = NULL;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
	u64 bytes_added;
	int ret;

again:
	if (!info) {
		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
		if (!info)
			return -ENOMEM;
	}

	if (!bitmap) {
		spin_lock(&ctl->tree_lock);
		info->offset = offset;
		info->bytes = bytes;
		info->max_extent_size = 0;
		ret = link_free_space(ctl, info);
		spin_unlock(&ctl->tree_lock);
		if (ret)
			kmem_cache_free(btrfs_free_space_cachep, info);
		return ret;
	}

	if (!map) {
		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
		if (!map) {
			kmem_cache_free(btrfs_free_space_cachep, info);
			return -ENOMEM;
		}
	}

	spin_lock(&ctl->tree_lock);
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		info->bitmap = map;
		map = NULL;
		add_new_bitmap(ctl, info, offset);
		bitmap_info = info;
		info = NULL;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);

	bytes -= bytes_added;
	offset += bytes_added;
	spin_unlock(&ctl->tree_lock);

	if (bytes)
		goto again;

	if (info)
		kmem_cache_free(btrfs_free_space_cachep, info);
	if (map)
		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
	return 0;
}

/*
 * Checks to see if the given range is in the free space cache.  This is really
 * just used to check the absence of space, so if there is free space in the
 * range at all we will return 1.
 */
int test_check_exists(struct btrfs_block_group *cache,
		      u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;

	spin_lock(&ctl->tree_lock);
	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info)
			goto out;
	}

have_info:
	if (info->bitmap) {
		u64 bit_off, bit_bytes;
		struct rb_node *n;
		struct btrfs_free_space *tmp;

		bit_off = offset;
		bit_bytes = ctl->unit;
		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
		if (!ret) {
			if (bit_off == offset) {
				ret = 1;
				goto out;
			} else if (bit_off > offset &&
				   offset + bytes > bit_off) {
				ret = 1;
				goto out;
			}
		}

		n = rb_prev(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (tmp->offset + tmp->bytes < offset)
				break;
			if (offset + bytes < tmp->offset) {
				n = rb_prev(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		n = rb_next(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (offset + bytes < tmp->offset)
				break;
			if (tmp->offset + tmp->bytes < offset) {
				n = rb_next(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		ret = 0;
		goto out;
	}

	if (info->offset == offset) {
		ret = 1;
		goto out;
	}

	if (offset > info->offset && offset < info->offset + info->bytes)
		ret = 1;
out:
	spin_unlock(&ctl->tree_lock);
	return ret;
}
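
/*
 * Illustrative sketch (not part of the original btrfs code): how a sanity
 * test might combine the two helpers above.  An extent entry is injected
 * directly, bypassing the merging in add_free_space(), and then probed.
 * The helper name and the chosen offset/size are assumptions for illustration.
 */
static inline int sanity_entry_roundtrip_sketch(struct btrfs_block_group *cache)
{
	int ret;

	/* Add a 4MiB extent entry at offset 0 without any merging. */
	ret = test_add_free_space_entry(cache, 0, SZ_4M, false);
	if (ret)
		return ret;

	/* The range must now be visible in the free space cache. */
	return test_check_exists(cache, 0, SZ_4M) ? 0 : -EINVAL;
}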
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */