// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/slab.h>
7
#include <linux/blkdev.h>
8
#include <linux/writeback.h>
9
#include <linux/sched/mm.h>
10
#include "misc.h"
C
Chris Mason 已提交
11 12 13
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
14
#include "extent_io.h"
15
#include "disk-io.h"
16
#include "compression.h"
17
#include "delalloc-space.h"
18
#include "qgroup.h"
19
#include "subpage.h"
C
Chris Mason 已提交
20

21 22
static struct kmem_cache *btrfs_ordered_extent_cache;

23
static u64 entry_end(struct btrfs_ordered_extent *entry)
C
Chris Mason 已提交
24
{
25
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
26
		return (u64)-1;
27
	return entry->file_offset + entry->num_bytes;
C
Chris Mason 已提交
28 29
}

C
Chris Mason 已提交
30 31 32
/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
33 34
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
C
Chris Mason 已提交
35
{
C
Chris Mason 已提交
36 37
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
38
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
39

C
Chris Mason 已提交
40
	while (*p) {
C
Chris Mason 已提交
41
		parent = *p;
42
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
43

44
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
45
			p = &(*p)->rb_left;
46
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
47 48 49 50 51 52 53 54 55 56
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

C
Chris Mason 已提交
57 58 59 60
/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
61 62
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
C
Chris Mason 已提交
63
{
C
Chris Mason 已提交
64
	struct rb_node *n = root->rb_node;
C
Chris Mason 已提交
65
	struct rb_node *prev = NULL;
66 67 68
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;
C
Chris Mason 已提交
69

C
Chris Mason 已提交
70
	while (n) {
71
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
72 73 74
		prev = n;
		prev_entry = entry;

75
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
76
			n = n->rb_left;
77
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
78 79 80 81 82 83 84
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

C
Chris Mason 已提交
85
	while (prev && file_offset >= entry_end(prev_entry)) {
86 87 88 89 90 91 92 93 94 95 96 97 98
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
C
Chris Mason 已提交
99
	while (prev && file_offset < entry_end(prev_entry)) {
100 101 102 103 104 105
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
C
Chris Mason 已提交
106 107 108 109 110
	}
	*prev_ret = prev;
	return NULL;
}

111 112 113 114
/*
 * Return 1 if the range [file_offset, file_offset + len) intersects the file
 * range covered by @entry, 0 otherwise.
 */
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	u64 range_end = file_offset + len;
	u64 oe_end = entry->file_offset + entry->num_bytes;

	/* Overlap means each range starts before the other one ends. */
	return range_end > entry->file_offset && oe_end > file_offset;
}

C
Chris Mason 已提交
120 121 122 123
/*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
124 125
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
C
Chris Mason 已提交
126
{
127
	struct rb_root *root = &tree->tree;
128
	struct rb_node *prev = NULL;
C
Chris Mason 已提交
129
	struct rb_node *ret;
130 131 132 133 134
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
135
		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
136 137 138
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
C
Chris Mason 已提交
139
	if (!ret)
140 141 142
		ret = prev;
	if (ret)
		tree->last = ret;
C
Chris Mason 已提交
143 144 145
	return ret;
}

146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
/**
 * Add an ordered extent to the per-inode tree.
 *
 * @inode:           Inode that this extent is for.
 * @file_offset:     Logical offset in file where the extent starts.
 * @num_bytes:       Logical length of extent in file.
 * @ram_bytes:       Full length of unencoded data.
 * @disk_bytenr:     Offset of extent on disk.
 * @disk_num_bytes:  Size of extent on disk.
 * @offset:          Offset into unencoded data where file data starts.
 * @flags:           Flags specifying type of extent (1 << BTRFS_ORDERED_*).
 * @compress_type:   Compression algorithm used for data.
 *
 * Most of these parameters correspond to &struct btrfs_file_extent_item. The
 * tree is given a single reference on the ordered extent that was inserted.
161
 *
162
 * Return: 0 or -ENOMEM.
163
 */
164 165 166 167
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
			     u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
			     u64 disk_num_bytes, u64 offset, unsigned flags,
			     int compress_type)
C
Chris Mason 已提交
168
{
169 170 171
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
172 173
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
174 175
	int ret;

176 177
	if (flags &
	    ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
178
		/* For nocow write, we can release the qgroup rsv right now */
179
		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
180 181 182 183 184 185 186 187
		if (ret < 0)
			return ret;
		ret = 0;
	} else {
		/*
		 * The ordered extent has reserved qgroup space, release now
		 * and pass the reserved number for qgroup_record to free.
		 */
188
		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
189 190 191
		if (ret < 0)
			return ret;
	}
192
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
C
Chris Mason 已提交
193 194 195
	if (!entry)
		return -ENOMEM;

196
	entry->file_offset = file_offset;
197
	entry->num_bytes = num_bytes;
198 199
	entry->ram_bytes = ram_bytes;
	entry->disk_bytenr = disk_bytenr;
200
	entry->disk_num_bytes = disk_num_bytes;
201
	entry->offset = offset;
202
	entry->bytes_left = num_bytes;
203
	entry->inode = igrab(&inode->vfs_inode);
204
	entry->compress_type = compress_type;
205
	entry->truncated_len = (u64)-1;
206
	entry->qgroup_rsv = ret;
207
	entry->physical = (u64)-1;
208

209 210
	ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
	entry->flags = flags;
211

212 213 214
	percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
				 fs_info->delalloc_batch);

215
	/* one ref for the tree */
216
	refcount_set(&entry->refs, 1);
217 218
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
219
	INIT_LIST_HEAD(&entry->log_list);
220
	INIT_LIST_HEAD(&entry->root_extent_list);
221 222
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
C
Chris Mason 已提交
223

224
	trace_btrfs_ordered_extent_add(inode, entry);
225

226
	spin_lock_irq(&tree->lock);
227 228
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
229
	if (node)
230 231 232
		btrfs_panic(fs_info, -EEXIST,
				"inconsistency in ordered tree at offset %llu",
				file_offset);
233
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
234

235
	spin_lock(&root->ordered_extent_lock);
236
	list_add_tail(&entry->root_extent_list,
237 238 239
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
240
		spin_lock(&fs_info->ordered_root_lock);
241
		BUG_ON(!list_empty(&root->ordered_root));
242 243
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
244 245
	}
	spin_unlock(&root->ordered_extent_lock);
246

J
Josef Bacik 已提交
247 248 249 250 251
	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
252 253 254
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);
	spin_unlock(&inode->lock);
J
Josef Bacik 已提交
255

C
Chris Mason 已提交
256 257 258
	return 0;
}

259 260
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
261 262
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
263
 */
264
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
265
			   struct btrfs_ordered_sum *sum)
C
Chris Mason 已提交
266
{
267
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
268

269
	tree = &BTRFS_I(entry->inode)->ordered_tree;
270
	spin_lock_irq(&tree->lock);
271
	list_add_tail(&sum->list, &entry->list);
272
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
273 274
}

275 276 277 278 279 280 281 282
/*
 * Work callback: recover the ordered extent that embeds @work and run
 * btrfs_finish_ordered_io() on it.
 */
static void finish_ordered_fn(struct btrfs_work *work)
{
	btrfs_finish_ordered_io(container_of(work, struct btrfs_ordered_extent,
					     work));
}

283
/*
284
 * Mark all ordered extents io inside the specified range finished.
285
 *
D
David Sterba 已提交
286
 * @page:	 The involved page for the operation.
287 288 289 290 291
 *		 For uncompressed buffered IO, the page status also needs to be
 *		 updated to indicate whether the pending ordered io is finished.
 *		 Can be NULL for direct IO and compressed write.
 *		 For these cases, callers are ensured they won't execute the
 *		 endio function twice.
292
 *
293
 * This function is called for endio, thus the range must have ordered
D
David Sterba 已提交
294
 * extent(s) covering it.
295
 */
296
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
297 298
				    struct page *page, u64 file_offset,
				    u64 num_bytes, bool uptodate)
299
{
300
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
301 302
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_workqueue *wq;
303 304
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
305
	unsigned long flags;
306 307 308 309 310 311 312 313 314 315
	u64 cur = file_offset;

	if (btrfs_is_free_space_inode(inode))
		wq = fs_info->endio_freespace_worker;
	else
		wq = fs_info->endio_write_workers;

	if (page)
		ASSERT(page->mapping && page_offset(page) <= file_offset &&
		       file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
316

317
	spin_lock_irqsave(&tree->lock, flags);
318 319 320 321 322 323 324 325 326
	while (cur < file_offset + num_bytes) {
		u64 entry_end;
		u64 end;
		u32 len;

		node = tree_search(tree, cur);
		/* No ordered extents at all */
		if (!node)
			break;
327

328 329
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		entry_end = entry->file_offset + entry->num_bytes;
330
		/*
331 332 333
		 * |<-- OE --->|  |
		 *		  cur
		 * Go to next OE.
334
		 */
335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370
		if (cur >= entry_end) {
			node = rb_next(node);
			/* No more ordered extents, exit */
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_ordered_extent,
					 rb_node);

			/* Go to next ordered extent and continue */
			cur = entry->file_offset;
			continue;
		}
		/*
		 * |	|<--- OE --->|
		 * cur
		 * Go to the start of OE.
		 */
		if (cur < entry->file_offset) {
			cur = entry->file_offset;
			continue;
		}

		/*
		 * Now we are definitely inside one ordered extent.
		 *
		 * |<--- OE --->|
		 *	|
		 *	cur
		 */
		end = min(entry->file_offset + entry->num_bytes,
			  file_offset + num_bytes) - 1;
		ASSERT(end + 1 - cur < U32_MAX);
		len = end + 1 - cur;

		if (page) {
			/*
371 372
			 * Ordered (Private2) bit indicates whether we still
			 * have pending io unfinished for the ordered extent.
373 374 375
			 *
			 * If there's no such bit, we need to skip to next range.
			 */
376
			if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
377 378 379
				cur += len;
				continue;
			}
380
			btrfs_page_clear_ordered(fs_info, page, cur, len);
381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408
		}

		/* Now we're fine to update the accounting */
		if (unlikely(len > entry->bytes_left)) {
			WARN_ON(1);
			btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
				   inode->root->root_key.objectid,
				   btrfs_ino(inode),
				   entry->file_offset,
				   entry->num_bytes,
				   len, entry->bytes_left);
			entry->bytes_left = 0;
		} else {
			entry->bytes_left -= len;
		}

		if (!uptodate)
			set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

		/*
		 * All the IO of the ordered extent is finished, we need to queue
		 * the finish_func to be executed.
		 */
		if (entry->bytes_left == 0) {
			set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
			cond_wake_up(&entry->wait);
			refcount_inc(&entry->refs);
409
			trace_btrfs_ordered_extent_mark_finished(inode, entry);
410
			spin_unlock_irqrestore(&tree->lock, flags);
411
			btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
412 413 414 415
			btrfs_queue_work(wq, &entry->work);
			spin_lock_irqsave(&tree->lock, flags);
		}
		cur += len;
416
	}
417
	spin_unlock_irqrestore(&tree->lock, flags);
418 419
}

420
/*
421 422 423 424 425 426 427 428
 * Finish IO for one ordered extent across a given range.  The range can only
 * contain one ordered extent.
 *
 * @cached:	 The cached ordered extent. If not NULL, we can skip the tree
 *               search and use the ordered extent directly.
 * 		 Will be also used to store the finished ordered extent.
 * @file_offset: File offset for the finished IO
 * @io_size:	 Length of the finish IO range
429
 *
430 431 432 433 434 435
 * Return true if the ordered extent is finished in the range, and update
 * @cached.
 * Return false otherwise.
 *
 * NOTE: The range can NOT cross multiple ordered extents.
 * Thus caller should ensure the range doesn't cross ordered extents.
436
 */
437 438
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent **cached,
439
				    u64 file_offset, u64 io_size)
C
Chris Mason 已提交
440
{
441
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
C
Chris Mason 已提交
442
	struct rb_node *node;
443
	struct btrfs_ordered_extent *entry = NULL;
444
	unsigned long flags;
445
	bool finished = false;
446

447 448 449 450 451 452
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

453
	node = tree_search(tree, file_offset);
454
	if (!node)
455
		goto out;
C
Chris Mason 已提交
456

457
	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
458
have_entry:
459
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
460 461
		goto out;

462
	if (io_size > entry->bytes_left)
463
		btrfs_crit(inode->root->fs_info,
464
			   "bad ordered accounting left %llu size %llu",
465
		       entry->bytes_left, io_size);
466

467
	entry->bytes_left -= io_size;
468

469
	if (entry->bytes_left == 0) {
470 471 472 473 474
		/*
		 * Ensure only one caller can set the flag and finished_ret
		 * accordingly
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
475 476
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
477
	}
478
out:
479
	if (finished && cached && entry) {
480
		*cached = entry;
481
		refcount_inc(&entry->refs);
482
		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
483
	}
484
	spin_unlock_irqrestore(&tree->lock, flags);
485
	return finished;
486
}
C
Chris Mason 已提交
487

488 489 490 491
/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
492
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
493
{
494 495 496
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

497
	trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
498

499
	if (refcount_dec_and_test(&entry->refs)) {
500
		ASSERT(list_empty(&entry->root_extent_list));
501
		ASSERT(list_empty(&entry->log_list));
502
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
503 504
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
C
Chris Mason 已提交
505
		while (!list_empty(&entry->list)) {
506 507 508
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
509
			kvfree(sum);
510
		}
511
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
512
	}
C
Chris Mason 已提交
513
}
514

515 516
/*
 * remove an ordered extent from the tree.  No references are dropped
517
 * and waiters are woken up.
518
 */
519
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
520
				 struct btrfs_ordered_extent *entry)
521
{
522
	struct btrfs_ordered_inode_tree *tree;
J
Josef Bacik 已提交
523
	struct btrfs_root *root = btrfs_inode->root;
524
	struct btrfs_fs_info *fs_info = root->fs_info;
525
	struct rb_node *node;
526
	bool pending;
527

J
Josef Bacik 已提交
528 529 530 531
	/* This is paired with btrfs_add_ordered_extent. */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
532 533 534 535 536 537 538 539 540
	if (root != fs_info->tree_root) {
		u64 release;

		if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
			release = entry->disk_num_bytes;
		else
			release = entry->num_bytes;
		btrfs_delalloc_release_metadata(btrfs_inode, release, false);
	}
J
Josef Bacik 已提交
541

542 543
	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
				 fs_info->delalloc_batch);
J
Josef Bacik 已提交
544

J
Josef Bacik 已提交
545
	tree = &btrfs_inode->ordered_tree;
546
	spin_lock_irq(&tree->lock);
547
	node = &entry->rb_node;
548
	rb_erase(node, &tree->tree);
549
	RB_CLEAR_NODE(node);
550 551
	if (tree->last == node)
		tree->last = NULL;
552
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
553
	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
554
	spin_unlock_irq(&tree->lock);
555

556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582
	/*
	 * The current running transaction is waiting on us, we need to let it
	 * know that we're complete and wake it up.
	 */
	if (pending) {
		struct btrfs_transaction *trans;

		/*
		 * The checks for trans are just a formality, it should be set,
		 * but if it isn't we don't want to deref/assert under the spin
		 * lock, so be nice and check if trans is set, but ASSERT() so
		 * if it isn't set a developer will notice.
		 */
		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ASSERT(trans);
		if (trans) {
			if (atomic_dec_and_test(&trans->pending_ordered))
				wake_up(&trans->pending_wait);
			btrfs_put_transaction(trans);
		}
	}

583
	spin_lock(&root->ordered_extent_lock);
584
	list_del_init(&entry->root_extent_list);
585
	root->nr_ordered_extents--;
586

587
	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
588

589
	if (!root->nr_ordered_extents) {
590
		spin_lock(&fs_info->ordered_root_lock);
591 592
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
593
		spin_unlock(&fs_info->ordered_root_lock);
594 595
	}
	spin_unlock(&root->ordered_extent_lock);
596
	wake_up(&entry->wait);
597 598
}

599
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
600 601 602 603
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
604
	btrfs_start_ordered_extent(ordered, 1);
605 606 607
	complete(&ordered->completion);
}

C
Chris Mason 已提交
608 609 610 611
/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
612
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
613
			       const u64 range_start, const u64 range_len)
614
{
615
	struct btrfs_fs_info *fs_info = root->fs_info;
616 617 618
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
619
	struct btrfs_ordered_extent *ordered, *next;
620
	u64 count = 0;
621
	const u64 range_end = range_start + range_len;
622

623
	mutex_lock(&root->ordered_extent_mutex);
624 625
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
626
	while (!list_empty(&splice) && nr) {
627 628
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);
629

630 631
		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
632 633 634 635 636
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

637 638
		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
639
		refcount_inc(&ordered->refs);
640
		spin_unlock(&root->ordered_extent_lock);
641

642 643
		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL, NULL);
644
		list_add_tail(&ordered->work_list, &works);
645
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
646

647
		cond_resched();
648
		spin_lock(&root->ordered_extent_lock);
649
		if (nr != U64_MAX)
650 651
			nr--;
		count++;
652
	}
653
	list_splice_tail(&skipped, &root->ordered_extents);
654
	list_splice_tail(&splice, &root->ordered_extents);
655
	spin_unlock(&root->ordered_extent_lock);
656 657 658 659 660 661 662

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
663
	mutex_unlock(&root->ordered_extent_mutex);
664 665

	return count;
666 667
}

668
/*
 * Run btrfs_wait_ordered_extents() on the roots listed in
 * fs_info->ordered_roots, passing @range_start / @range_len through.
 *
 * @nr is the remaining budget of ordered extents: after each root it is
 * reduced by the number that root reported, unless it is U64_MAX, and the
 * walk stops once it reaches zero.
 */
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			     const u64 range_start, const u64 range_len)
{
	struct list_head roots;
	struct btrfs_root *root;

	INIT_LIST_HEAD(&roots);

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &roots);
	while (!list_empty(&roots) && nr) {
		u64 waited;

		root = list_first_entry(&roots, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		/* Put the root back on the fs-wide list before unlocking. */
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		waited = btrfs_wait_ordered_extents(root, nr,
						    range_start, range_len);
		btrfs_put_root(root);

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX)
			nr -= waited;
	}
	/* Roots we never got to go back onto the fs-wide list. */
	list_splice_tail(&roots, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}

703 704 705 706 707 708 709
/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
710
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
711 712
{
	u64 start = entry->file_offset;
713
	u64 end = start + entry->num_bytes - 1;
714
	struct btrfs_inode *inode = BTRFS_I(entry->inode);
715

716
	trace_btrfs_ordered_extent_start(inode, entry);
717

718 719 720
	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
721
	 * for the flusher thread to find them
722
	 */
723
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
724
		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
C
Chris Mason 已提交
725
	if (wait) {
726 727
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
C
Chris Mason 已提交
728
	}
729
}
730

731 732 733
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
734
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
735
{
736
	int ret = 0;
737
	int ret_wb = 0;
738
	u64 end;
739
	u64 orig_end;
740
	struct btrfs_ordered_extent *ordered;
741 742

	if (start + len < start) {
743
		orig_end = INT_LIMIT(loff_t);
744 745
	} else {
		orig_end = start + len - 1;
746 747
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
748
	}
749

750 751 752
	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
753
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
754 755
	if (ret)
		return ret;
756

757 758 759 760 761 762 763 764
	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
765

766
	end = orig_end;
C
Chris Mason 已提交
767
	while (1) {
768
		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
C
Chris Mason 已提交
769
		if (!ordered)
770
			break;
771
		if (ordered->file_offset > orig_end) {
772 773 774
			btrfs_put_ordered_extent(ordered);
			break;
		}
775
		if (ordered->file_offset + ordered->num_bytes <= start) {
776 777 778
			btrfs_put_ordered_extent(ordered);
			break;
		}
779
		btrfs_start_ordered_extent(ordered, 1);
780
		end = ordered->file_offset;
781 782 783 784 785
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
786 787
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
788
		btrfs_put_ordered_extent(ordered);
789
		if (end == 0 || end == start)
790 791 792
			break;
		end--;
	}
793
	return ret_wb ? ret_wb : ret;
794 795
}

796 797 798 799
/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
800
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
801 802 803 804 805
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
806
	unsigned long flags;
807

808
	tree = &inode->ordered_tree;
809
	spin_lock_irqsave(&tree->lock, flags);
810 811 812 813 814
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
815
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
816
		entry = NULL;
817
	if (entry) {
818
		refcount_inc(&entry->refs);
819 820
		trace_btrfs_ordered_extent_lookup(inode, entry);
	}
821
out:
822
	spin_unlock_irqrestore(&tree->lock, flags);
823 824 825
	return entry;
}

826 827 828
/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
829 830
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
831 832 833 834 835
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

836
	tree = &inode->ordered_tree;
837
	spin_lock_irq(&tree->lock);
838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
860
	if (entry) {
861
		refcount_inc(&entry->refs);
862 863
		trace_btrfs_ordered_extent_lookup_range(inode, entry);
	}
864
	spin_unlock_irq(&tree->lock);
865 866 867
	return entry;
}

868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891
/*
 * Append every ordered extent of @inode that is not flagged
 * BTRFS_ORDERED_LOGGED to @list (through its log_list member), taking a
 * reference on each.  The tree is walked in rbtree order, so the extents are
 * appended sorted by file_offset.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
					   struct list_head *list)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *n;

	ASSERT(inode_is_locked(&inode->vfs_inode));

	spin_lock_irq(&tree->lock);
	n = rb_first(&tree->tree);
	while (n) {
		struct btrfs_ordered_extent *ordered =
			rb_entry(n, struct btrfs_ordered_extent, rb_node);

		if (!test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) {
			ASSERT(list_empty(&ordered->log_list));
			list_add_tail(&ordered->log_list, list);
			refcount_inc(&ordered->refs);
			trace_btrfs_ordered_extent_lookup_for_logging(inode,
								      ordered);
		}
		n = rb_next(n);
	}
	spin_unlock_irq(&tree->lock);
}

897 898 899 900
/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
901
struct btrfs_ordered_extent *
902
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
903 904 905 906 907
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

908
	tree = &inode->ordered_tree;
909
	spin_lock_irq(&tree->lock);
910 911 912 913 914
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
915
	refcount_inc(&entry->refs);
916
	trace_btrfs_ordered_extent_lookup_first(inode, entry);
917
out:
918
	spin_unlock_irq(&tree->lock);
919
	return entry;
920
}
921

922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990
/*
 * Lookup the first ordered extent that overlaps the range
 * [@file_offset, @file_offset + @len).
 *
 * The difference between this and btrfs_lookup_first_ordered_extent() is
 * that this one won't return any ordered extent that does not overlap the range.
 * And the difference against btrfs_lookup_ordered_extent() is, this function
 * ensures the first ordered extent gets returned.
 *
 * The returned extent has an extra reference taken; NULL means no ordered
 * extent overlaps the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
			struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct btrfs_ordered_extent *entry = NULL;
	struct btrfs_ordered_extent *oe = NULL;
	struct rb_node *node;
	struct rb_node *before;
	struct rb_node *after;

	spin_lock_irq(&tree->lock);
	node = tree->tree.rb_node;
	/*
	 * Here we don't want to use tree_search() which will use tree->last
	 * and screw up the search order.
	 * And __tree_search() can't return the adjacent ordered extents
	 * either, thus here we do our own search.
	 */
	while (node) {
		oe = rb_entry(node, struct btrfs_ordered_extent, rb_node);

		if (file_offset < oe->file_offset) {
			node = node->rb_left;
		} else if (file_offset >= entry_end(oe)) {
			node = node->rb_right;
		} else {
			/*
			 * Direct hit, got an ordered extent that contains
			 * @file_offset
			 */
			entry = oe;
			goto out;
		}
	}

	/* Empty tree */
	if (!oe)
		goto out;

	/*
	 * The descent ended on an extent next to @file_offset.  Pick the pair
	 * of adjacent nodes that surround @file_offset and test them in order.
	 */
	if (oe->file_offset < file_offset) {
		before = &oe->rb_node;
		after = rb_next(before);
	} else {
		after = &oe->rb_node;
		before = rb_prev(after);
	}

	if (before) {
		oe = rb_entry(before, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(oe, file_offset, len)) {
			entry = oe;
			goto out;
		}
	}
	if (after) {
		oe = rb_entry(after, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(oe, file_offset, len)) {
			entry = oe;
			goto out;
		}
	}
	/* No ordered extent in the range, entry stays NULL */
out:
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
	}

	spin_unlock_irq(&tree->lock);
	return entry;
}

1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012
/*
 * btrfs_flush_ordered_range - Lock the passed range and ensures all pending
 * ordered extents in it are run to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 * locked range. It's the caller's responsibility to free the cached state.
 *
 * This function always returns with the given range locked, ensuring after it's
 * called no order extent can be pending.
 */
1013
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
1014 1015 1016 1017
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
1018 1019
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;
1020 1021

	if (cached_state)
1022
		cachedp = cached_state;
1023 1024

	while (1) {
1025
		lock_extent_bits(&inode->io_tree, start, end, cachedp);
1026 1027
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
1028 1029 1030 1031 1032 1033 1034
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
1035
				refcount_dec(&cache->refs);
1036
			break;
1037
		}
1038
		unlock_extent_cached(&inode->io_tree, start, end, cachedp);
1039
		btrfs_start_ordered_extent(ordered, 1);
1040 1041 1042 1043
		btrfs_put_ordered_extent(ordered);
	}
}

/*
 * Add a new ordered extent of @len bytes at @file_offset / @disk_bytenr that
 * inherits the type flags and compress type of @ordered.
 *
 * @ordered:     ordered extent being split, used as the template
 * @file_offset: absolute file offset of the new ordered extent
 * @disk_bytenr: absolute disk bytenr of the new ordered extent
 * @len:         length of the new ordered extent, used for both its file and
 *               disk sizes
 *
 * Returns whatever btrfs_add_ordered_extent() returns.
 */
static int clone_ordered_extent(struct btrfs_ordered_extent *ordered,
				u64 file_offset, u64 disk_bytenr, u64 len)
{
	struct inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;

	/*
	 * The splitting extent is already counted and will be added again in
	 * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
	 */
	percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
				 fs_info->delalloc_batch);
	/* Both sizes are passed as @len, so compressed extents are unexpected. */
	WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
	return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
					disk_bytenr, len, 0, flags,
					ordered->compress_type);
}

/*
 * Split @ordered into up to three ordered extents.
 *
 * @ordered: ordered extent to split; on return it covers only the middle part
 * @pre:     number of bytes to split off the front into a new ordered extent
 * @post:    number of bytes to split off the end into a new ordered extent
 *
 * @ordered is removed from the inode's ordered tree, trimmed by @pre bytes at
 * the front and @post bytes at the end, and re-inserted.  New ordered extents
 * are then added for the trimmed front and tail (each only if non-zero).
 *
 * Returns 0 on success or the error returned while adding one of the new
 * ordered extents.
 */
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
				u64 post)
{
	struct inode *inode = ordered->inode;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	struct rb_node *node;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 pre_file_offset;
	u64 pre_disk_bytenr;
	u64 post_file_offset;
	u64 post_disk_bytenr;
	int ret = 0;

	trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);

	spin_lock_irq(&tree->lock);
	/* Remove from tree once */
	node = &ordered->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	if (tree->last == node)
		tree->last = NULL;

	/*
	 * The front piece starts where the original extent started, so
	 * remember that position before @ordered is shifted by @pre.
	 */
	pre_file_offset = ordered->file_offset;
	pre_disk_bytenr = ordered->disk_bytenr;

	ordered->file_offset += pre;
	ordered->disk_bytenr += pre;
	ordered->num_bytes -= (pre + post);
	ordered->disk_num_bytes -= (pre + post);
	ordered->bytes_left -= (pre + post);

	/* The tail piece starts right after the trimmed extent. */
	post_file_offset = ordered->file_offset + ordered->num_bytes;
	post_disk_bytenr = ordered->disk_bytenr + ordered->disk_num_bytes;

	/* Re-insert the node */
	node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
			"zoned: inconsistency in ordered tree at offset %llu",
			    ordered->file_offset);

	spin_unlock_irq(&tree->lock);

	if (pre)
		ret = clone_ordered_extent(ordered, pre_file_offset,
					   pre_disk_bytenr, pre);
	if (ret == 0 && post)
		ret = clone_ordered_extent(ordered, post_file_offset,
					   post_disk_bytenr, post);

	return ret;
}

1108 1109 1110 1111
int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
1112
				     SLAB_MEM_SPREAD,
1113 1114 1115
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;
1116

1117 1118 1119
	return 0;
}

1120
void __cold ordered_data_exit(void)
1121
{
1122
	kmem_cache_destroy(btrfs_ordered_extent_cache);
1123
}