// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"
#include "subpage.h"
#include "file.h"
#include "super.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

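/*
 * Return the exclusive end offset of an ordered extent in the file,
 * saturating at (u64)-1 if file_offset + num_bytes would overflow.
 */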
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->num_bytes;
}

/*
 * Returns NULL if the insertion worked, or the node already present in the
 * tree that it collided with.
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * Look for a given offset in the tree, and if it can't be found return the
 * first lesser offset.
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->num_bytes <= file_offset)
		return 0;
	return 1;
}

/*
 * Look for the first ordered struct that has this offset, otherwise
 * the first one less than this offset.
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
{
	struct rb_root *root = &tree->tree;
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		tree->last = ret;
	return ret;
}

/*
 * Add an ordered extent to the per-inode tree.
 *
 * @inode:           Inode that this extent is for.
 * @file_offset:     Logical offset in file where the extent starts.
 * @num_bytes:       Logical length of extent in file.
 * @ram_bytes:       Full length of unencoded data.
 * @disk_bytenr:     Offset of extent on disk.
 * @disk_num_bytes:  Size of extent on disk.
 * @offset:          Offset into unencoded data where file data starts.
 * @flags:           Flags specifying type of extent (1 << BTRFS_ORDERED_*).
 * @compress_type:   Compression algorithm used for data.
 *
 * Most of these parameters correspond to &struct btrfs_file_extent_item. The
 * tree is given a single reference on the ordered extent that was inserted.
 *
 * Return: 0 or -ENOMEM.
 */
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
			     u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
			     u64 disk_num_bytes, u64 offset, unsigned flags,
			     int compress_type)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
	int ret;

	if (flags &
	    ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
		/* For nocow write, we can release the qgroup rsv right now */
		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
		if (ret < 0)
			return ret;
		ret = 0;
	} else {
		/*
		 * The ordered extent has reserved qgroup space, release now
		 * and pass the reserved number for qgroup_record to free.
		 */
		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
		if (ret < 0)
			return ret;
	}
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
	if (!entry)
		return -ENOMEM;

	entry->file_offset = file_offset;
	entry->num_bytes = num_bytes;
	entry->ram_bytes = ram_bytes;
	entry->disk_bytenr = disk_bytenr;
	entry->disk_num_bytes = disk_num_bytes;
	entry->offset = offset;
	entry->bytes_left = num_bytes;
	entry->inode = igrab(&inode->vfs_inode);
	entry->compress_type = compress_type;
	entry->truncated_len = (u64)-1;
	entry->qgroup_rsv = ret;
	entry->physical = (u64)-1;

	ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
	entry->flags = flags;

	percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
				 fs_info->delalloc_batch);

	/* one ref for the tree */
	refcount_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->log_list);
	INIT_LIST_HEAD(&entry->root_extent_list);
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);

	trace_btrfs_ordered_extent_add(inode, entry);

	spin_lock_irq(&tree->lock);
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
				"inconsistency in ordered tree at offset %llu",
				file_offset);
	spin_unlock_irq(&tree->lock);

	spin_lock(&root->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);

	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);
	spin_unlock(&inode->lock);

	return 0;
}

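/*
 * Usage sketch (illustrative, not a call site in this file): a write path
 * that has reserved qgroup space for [start, start + len) could pin the
 * range with an uncompressed, regular ordered extent like:
 *
 *	ret = btrfs_add_ordered_extent(inode, start, len, len, disk_bytenr,
 *				       len, 0, 1 << BTRFS_ORDERED_REGULAR,
 *				       BTRFS_COMPRESS_NONE);
 *
 * The tree then holds one reference, which is dropped by
 * btrfs_remove_ordered_extent() once the extent completes.
 */
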
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiple ordered extents.
 */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
{
	struct btrfs_ordered_inode_tree *tree;

	tree = &BTRFS_I(entry->inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	list_add_tail(&sum->list, &entry->list);
	spin_unlock_irq(&tree->lock);
}

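/*
 * Work queue callback for finishing an ordered extent whose IO has fully
 * completed, run from the endio write workers.
 */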
static void finish_ordered_fn(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered_extent;

	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
	btrfs_finish_ordered_io(ordered_extent);
}

/*
 * Mark the IO of all ordered extents inside the specified range as finished.
 *
 * @page:	 The involved page for the operation.
 *		 For uncompressed buffered IO, the page status also needs to be
 *		 updated to indicate whether the pending ordered io is finished.
 *		 Can be NULL for direct IO and compressed write.
 *		 For those cases, the callers ensure they won't execute the
 *		 endio function twice.
 *
 * This function is called for endio, thus the range must have ordered
 * extent(s) covering it.
 */
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
				    struct page *page, u64 file_offset,
				    u64 num_bytes, bool uptodate)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_workqueue *wq;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	u64 cur = file_offset;

	if (btrfs_is_free_space_inode(inode))
		wq = fs_info->endio_freespace_worker;
	else
		wq = fs_info->endio_write_workers;

	if (page)
		ASSERT(page->mapping && page_offset(page) <= file_offset &&
		       file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);

	spin_lock_irqsave(&tree->lock, flags);
	while (cur < file_offset + num_bytes) {
		u64 entry_end;
		u64 end;
		u32 len;

		node = tree_search(tree, cur);
		/* No ordered extents at all */
		if (!node)
			break;

		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		entry_end = entry->file_offset + entry->num_bytes;
		/*
		 * |<-- OE --->|  |
		 *		  cur
		 * Go to next OE.
		 */
		if (cur >= entry_end) {
			node = rb_next(node);
			/* No more ordered extents, exit */
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_ordered_extent,
					 rb_node);

			/* Go to next ordered extent and continue */
			cur = entry->file_offset;
			continue;
		}
		/*
		 * |	|<--- OE --->|
		 * cur
		 * Go to the start of OE.
		 */
		if (cur < entry->file_offset) {
			cur = entry->file_offset;
			continue;
		}

		/*
		 * Now we are definitely inside one ordered extent.
		 *
		 * |<--- OE --->|
		 *	|
		 *	cur
		 */
		end = min(entry->file_offset + entry->num_bytes,
			  file_offset + num_bytes) - 1;
		ASSERT(end + 1 - cur < U32_MAX);
		len = end + 1 - cur;

		if (page) {
			/*
			 * Ordered (Private2) bit indicates whether we still
			 * have pending io unfinished for the ordered extent.
			 *
			 * If there's no such bit, we need to skip to next range.
			 */
			if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
				cur += len;
				continue;
			}
			btrfs_page_clear_ordered(fs_info, page, cur, len);
		}

		/* Now we're fine to update the accounting */
		if (unlikely(len > entry->bytes_left)) {
			WARN_ON(1);
			btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
				   inode->root->root_key.objectid,
				   btrfs_ino(inode),
				   entry->file_offset,
				   entry->num_bytes,
				   len, entry->bytes_left);
			entry->bytes_left = 0;
		} else {
			entry->bytes_left -= len;
		}

		if (!uptodate)
			set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

		/*
		 * All the IO of the ordered extent is finished, we need to queue
		 * the finish_func to be executed.
		 */
		if (entry->bytes_left == 0) {
			set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
			cond_wake_up(&entry->wait);
			refcount_inc(&entry->refs);
			trace_btrfs_ordered_extent_mark_finished(inode, entry);
			spin_unlock_irqrestore(&tree->lock, flags);
			btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
			btrfs_queue_work(wq, &entry->work);
			spin_lock_irqsave(&tree->lock, flags);
		}
		cur += len;
	}
	spin_unlock_irqrestore(&tree->lock, flags);
}

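/*
 * Illustrative sketch of a caller (hypothetical, not from this file): a
 * buffered write endio handler finishing the part of an ordered extent
 * covered by one page could do:
 *
 *	btrfs_mark_ordered_io_finished(inode, page, page_offset(page),
 *				       PAGE_SIZE, !bio->bi_status);
 */
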
/*
 * Finish IO for one ordered extent across a given range.  The range can only
 * contain one ordered extent.
 *
 * @cached:	 The cached ordered extent. If not NULL, we can skip the tree
 *		 search and use the ordered extent directly.
 *		 Will also be used to store the finished ordered extent.
 * @file_offset: File offset for the finished IO
 * @io_size:	 Length of the finished IO range
 *
 * Return true if the ordered extent is finished in the range, and update
 * @cached.
 * Return false otherwise.
 *
 * NOTE: The range can NOT cross multiple ordered extents, thus the caller
 * must ensure the range doesn't cross any ordered extent boundary.
 */
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent **cached,
				    u64 file_offset, u64 io_size)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	bool finished = false;

	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
		goto out;

	if (io_size > entry->bytes_left)
		btrfs_crit(inode->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, io_size);

	entry->bytes_left -= io_size;

	if (entry->bytes_left == 0) {
		/*
		 * Ensure only one caller can set the flag and report the
		 * completion accordingly.
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	}
out:
	if (finished && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return finished;
}

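/*
 * Sketch of a typical use (hypothetical): a caller invalidating a sub-range
 * of one ordered extent can decrement the pending count and, if it was the
 * last chunk, run the completion itself:
 *
 *	struct btrfs_ordered_extent *ordered = NULL;
 *
 *	if (btrfs_dec_test_ordered_pending(inode, &ordered, cur, range_len))
 *		btrfs_finish_ordered_io(ordered);
 */
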
/*
 * Drop a reference on an ordered extent.  This will free the extent if the
 * last reference is dropped.
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

	trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);

	if (refcount_dec_and_test(&entry->refs)) {
		ASSERT(list_empty(&entry->root_extent_list));
		ASSERT(list_empty(&entry->log_list));
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
		if (entry->inode)
			btrfs_add_delayed_iput(BTRFS_I(entry->inode));
		while (!list_empty(&entry->list)) {
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kvfree(sum);
		}
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
	}
}

/*
 * Remove an ordered extent from the tree.  No references are dropped, but
 * any waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
				 struct btrfs_ordered_extent *entry)
{
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_root *root = btrfs_inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;
	bool pending;
	bool freespace_inode;

	/*
	 * If this is a free space inode the thread has not acquired the ordered
	 * extents lockdep map.
	 */
	freespace_inode = btrfs_is_free_space_inode(btrfs_inode);

	btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
	/* This is paired with btrfs_add_ordered_extent. */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root) {
		u64 release;

		if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
			release = entry->disk_num_bytes;
		else
			release = entry->num_bytes;
		btrfs_delalloc_release_metadata(btrfs_inode, release, false);
	}

	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
				 fs_info->delalloc_batch);

	tree = &btrfs_inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = &entry->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	if (tree->last == node)
		tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
	spin_unlock_irq(&tree->lock);

	/*
	 * The current running transaction is waiting on us, we need to let it
	 * know that we're complete and wake it up.
	 */
	if (pending) {
		struct btrfs_transaction *trans;

		/*
		 * The checks for trans are just a formality, it should be set,
		 * but if it isn't we don't want to deref/assert under the spin
		 * lock, so be nice and check if trans is set, but ASSERT() so
		 * if it isn't set a developer will notice.
		 */
		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ASSERT(trans);
		if (trans) {
			if (atomic_dec_and_test(&trans->pending_ordered))
				wake_up(&trans->pending_wait);
			btrfs_put_transaction(trans);
		}
	}

	btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);

	spin_lock(&root->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);
	root->nr_ordered_extents--;

	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);

	if (!root->nr_ordered_extents) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
	wake_up(&entry->wait);
	if (!freespace_inode)
		btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
}

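/*
 * Flush worker callback: start one ordered extent, wait for it to complete
 * and then signal anyone blocked on its completion.
 */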
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered);
	complete(&ordered->completion);
}

/*
 * Wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
			       const u64 range_start, const u64 range_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
	struct btrfs_ordered_extent *ordered, *next;
	u64 count = 0;
	const u64 range_end = range_start + range_len;

	mutex_lock(&root->ordered_extent_mutex);
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
	while (!list_empty(&splice) && nr) {
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);

		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		refcount_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);

		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL, NULL);
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

		cond_resched();
		spin_lock(&root->ordered_extent_lock);
		if (nr != U64_MAX)
			nr--;
		count++;
	}
	list_splice_tail(&skipped, &root->ordered_extents);
	list_splice_tail(&splice, &root->ordered_extents);
	spin_unlock(&root->ordered_extent_lock);

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
	mutex_unlock(&root->ordered_extent_mutex);

	return count;
}

void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			      const u64 range_start, const u64 range_len)
{
	struct btrfs_root *root;
	struct list_head splice;
	u64 done;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr,
						  range_start, range_len);
		btrfs_put_root(root);

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX)
			nr -= done;
	}
	list_splice_tail(&splice, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}

/*
 * Start IO and wait for a given ordered extent to finish.
 *
 * Wait on page writeback for all the pages in the extent and on the IO
 * completion code to insert metadata into the btree corresponding to the
 * extent.
 */
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->num_bytes - 1;
	struct btrfs_inode *inode = BTRFS_I(entry->inode);
	bool freespace_inode;

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * If this is a free space inode do not take the ordered extents lockdep
	 * map.
	 */
	freespace_inode = btrfs_is_free_space_inode(inode);

	/*
	 * Pages in the range can be dirty, clean or writeback.  We start IO
	 * on any dirty ones so the wait doesn't stall waiting for the
	 * flusher thread to find them.
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);

	if (!freespace_inode)
		btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
	wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
	int ret = 0;
	int ret_wb = 0;
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;

	if (start + len < start) {
		orig_end = OFFSET_MAX;
	} else {
		orig_end = start + len - 1;
		if (orig_end > OFFSET_MAX)
			orig_end = OFFSET_MAX;
	}

	/*
	 * Start IO across the range first to instantiate any delalloc
	 * extents.
	 */
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
	if (ret)
		return ret;

	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);

	end = orig_end;
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->num_bytes <= start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		btrfs_start_ordered_extent(ordered);
		end = ordered->file_offset;
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
		btrfs_put_ordered_extent(ordered);
		if (end == 0 || end == start)
			break;
		end--;
	}
	return ret_wb ? ret_wb : ret;
}

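/*
 * Example (sketch): fsync-style paths typically flush and wait on the whole
 * file with
 *
 *	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
 */
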
/*
 * Find an ordered extent corresponding to @file_offset.  Return NULL if
 * nothing is found, otherwise take a reference on the extent and return it.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;

	tree = &inode->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
		entry = NULL;
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup(inode, entry);
	}
out:
	spin_unlock_irqrestore(&tree->lock, flags);
	return entry;
}

/*
 * Since the DIO code tries to lock a wide area we need to look for any
 * ordered extents that exist in the range, rather than just the start of
 * the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup_range(inode, entry);
	}
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * Adds all ordered extents to the given list. The list ends up sorted by the
 * file_offset of the ordered extents.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
					   struct list_head *list)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *n;

	ASSERT(inode_is_locked(&inode->vfs_inode));

	spin_lock_irq(&tree->lock);
	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
		struct btrfs_ordered_extent *ordered;

		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);

		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
			continue;

		ASSERT(list_empty(&ordered->log_list));
		list_add_tail(&ordered->log_list, list);
		refcount_inc(&ordered->refs);
		trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
	}
	spin_unlock_irq(&tree->lock);
}

/*
 * Lookup and return any ordered extent at or before 'file_offset'.  NULL is
 * returned if none is found.
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	refcount_inc(&entry->refs);
	trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * Lookup the first ordered extent that overlaps the range
 * [@file_offset, @file_offset + @len).
 *
 * The difference between this and btrfs_lookup_first_ordered_extent() is
 * that this one won't return any ordered extent that does not overlap the
 * range.  And the difference from btrfs_lookup_ordered_extent() is that
 * this function ensures the first ordered extent gets returned.
 */
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
			struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *node;
	struct rb_node *cur;
	struct rb_node *prev;
	struct rb_node *next;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock_irq(&tree->lock);
	node = tree->tree.rb_node;
	/*
	 * Here we don't want to use tree_search() which will use tree->last
	 * and screw up the search order.
	 * And __tree_search() can't return the adjacent ordered extents
	 * either, thus here we do our own search.
	 */
	while (node) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset) {
			node = node->rb_left;
		} else if (file_offset >= entry_end(entry)) {
			node = node->rb_right;
		} else {
			/*
			 * Direct hit, got an ordered extent that starts at
			 * @file_offset
			 */
			goto out;
		}
	}
	if (!entry) {
		/* Empty tree */
		goto out;
	}

	cur = &entry->rb_node;
	/* We got an entry around @file_offset, check adjacent entries */
	if (entry->file_offset < file_offset) {
		prev = cur;
		next = rb_next(cur);
	} else {
		prev = rb_prev(cur);
		next = cur;
	}
	if (prev) {
		entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			goto out;
	}
	if (next) {
		entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			goto out;
	}
	/* No ordered extent in the range */
	entry = NULL;
out:
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
	}

	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * Lock the passed range and ensure all pending ordered extents in it are run
 * to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 *                locked range. It's the caller's responsibility to free the
 *                cached state.
 *
 * Always return with the given range locked, ensuring that after it's called
 * no ordered extent can be pending.
 */
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;

	if (cached_state)
		cachedp = cached_state;

	while (1) {
		lock_extent(&inode->io_tree, start, end, cachedp);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
				refcount_dec(&cache->refs);
			break;
		}
		unlock_extent(&inode->io_tree, start, end, cachedp);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
	}
}

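/*
 * Sketch of the pattern this helper wraps (hypothetical caller):
 *
 *	struct extent_state *cached = NULL;
 *
 *	btrfs_lock_and_flush_ordered_range(inode, start, end, &cached);
 *	... operate on the locked range, now free of ordered extents ...
 *	unlock_extent(&inode->io_tree, start, end, &cached);
 */
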
/*
 * Lock the passed range and ensure all pending ordered extents in it are run
 * to completion in nowait mode.
 *
 * Return true if the range was locked and no ordered extents were pending in
 * it, false otherwise.
 */
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
				  struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;

	if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
		return false;

	ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
	if (!ordered)
		return true;

	btrfs_put_ordered_extent(ordered);
	unlock_extent(&inode->io_tree, start, end, cached_state);

	return false;
}


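/*
 * Helper for btrfs_split_ordered_extent(): create a new ordered extent that
 * covers @len bytes at offset @pos (which may wrap, i.e. be effectively
 * negative) from the current start of @ordered.
 */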
static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
				u64 len)
{
	struct inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	u64 file_offset = ordered->file_offset + pos;
	u64 disk_bytenr = ordered->disk_bytenr + pos;
	unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;

	/*
	 * The splitting extent is already counted and will be added again in
	 * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
	 */
	percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
				 fs_info->delalloc_batch);
	WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
	return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
					disk_bytenr, len, 0, flags,
					ordered->compress_type);
}

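/*
 * Split an ordered extent, trimming @pre bytes from its front and @post
 * bytes from its tail, and cloning the trimmed pieces into ordered extents
 * of their own (used by zoned mode when a bio covers only part of the
 * extent).
 */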
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
				u64 post)
{
	struct inode *inode = ordered->inode;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	struct rb_node *node;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret = 0;

	trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);

	spin_lock_irq(&tree->lock);
	/* Remove from tree once */
	node = &ordered->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	if (tree->last == node)
		tree->last = NULL;

	ordered->file_offset += pre;
	ordered->disk_bytenr += pre;
	ordered->num_bytes -= (pre + post);
	ordered->disk_num_bytes -= (pre + post);
	ordered->bytes_left -= (pre + post);

	/* Re-insert the node */
	node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
			"zoned: inconsistency in ordered tree at offset %llu",
			    ordered->file_offset);

	spin_unlock_irq(&tree->lock);

	if (pre) {
		/*
		 * @ordered was already advanced by @pre above, so pass a
		 * negated (wrapping) offset to clone the original head of
		 * the extent.
		 */
		ret = clone_ordered_extent(ordered, pre * -1, pre);
	}
	if (ret == 0 && post)
		ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
					   post);

	return ret;
}

int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
				     SLAB_MEM_SPREAD,
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

void __cold ordered_data_exit(void)
{
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}