ordered-data.c 31.4 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
C
Chris Mason 已提交
2 3 4 5 6
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/slab.h>
7
#include <linux/blkdev.h>
8
#include <linux/writeback.h>
9
#include <linux/sched/mm.h>
10
#include "misc.h"
C
Chris Mason 已提交
11 12 13
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
14
#include "extent_io.h"
15
#include "disk-io.h"
16
#include "compression.h"
17
#include "delalloc-space.h"
18
#include "qgroup.h"
19
#include "subpage.h"
C
Chris Mason 已提交
20

21 22
static struct kmem_cache *btrfs_ordered_extent_cache;

23
static u64 entry_end(struct btrfs_ordered_extent *entry)
C
Chris Mason 已提交
24
{
25
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
26
		return (u64)-1;
27
	return entry->file_offset + entry->num_bytes;
C
Chris Mason 已提交
28 29
}

C
Chris Mason 已提交
30 31 32
/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
33 34
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
C
Chris Mason 已提交
35
{
C
Chris Mason 已提交
36 37
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
38
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
39

C
Chris Mason 已提交
40
	while (*p) {
C
Chris Mason 已提交
41
		parent = *p;
42
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
43

44
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
45
			p = &(*p)->rb_left;
46
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
47 48 49 50 51 52 53 54 55 56
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

C
Chris Mason 已提交
57 58 59 60
/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
61 62
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
C
Chris Mason 已提交
63
{
C
Chris Mason 已提交
64
	struct rb_node *n = root->rb_node;
C
Chris Mason 已提交
65
	struct rb_node *prev = NULL;
66 67 68
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;
C
Chris Mason 已提交
69

C
Chris Mason 已提交
70
	while (n) {
71
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
72 73 74
		prev = n;
		prev_entry = entry;

75
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
76
			n = n->rb_left;
77
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
78 79 80 81 82 83 84
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

C
Chris Mason 已提交
85
	while (prev && file_offset >= entry_end(prev_entry)) {
86 87 88 89 90 91 92 93 94 95 96 97 98
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
C
Chris Mason 已提交
99
	while (prev && file_offset < entry_end(prev_entry)) {
100 101 102 103 104 105
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
C
Chris Mason 已提交
106 107 108 109 110
	}
	*prev_ret = prev;
	return NULL;
}

111 112 113 114
/*
 * Return 1 if [@file_offset, @file_offset + @len) intersects the range
 * covered by @entry, 0 otherwise.
 */
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	/* The queried range ends at or before the entry starts. */
	if (file_offset + len <= entry->file_offset)
		return 0;

	/* The entry ends at or before the queried range starts. */
	if (entry->file_offset + entry->num_bytes <= file_offset)
		return 0;

	return 1;
}

C
Chris Mason 已提交
120 121 122 123
/*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
124 125
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
C
Chris Mason 已提交
126
{
127
	struct rb_root *root = &tree->tree;
128
	struct rb_node *prev = NULL;
C
Chris Mason 已提交
129
	struct rb_node *ret;
130 131 132 133 134
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
135
		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
136 137 138
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
C
Chris Mason 已提交
139
	if (!ret)
140 141 142
		ret = prev;
	if (ret)
		tree->last = ret;
C
Chris Mason 已提交
143 144 145
	return ret;
}

146 147
/*
 * Allocate and add a new ordered_extent into the per-inode tree.
148 149 150 151
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
152
static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
153 154 155
				      u64 disk_bytenr, u64 num_bytes,
				      u64 disk_num_bytes, int type, int dio,
				      int compress_type)
C
Chris Mason 已提交
156
{
157 158 159
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
160 161
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
162 163 164 165
	int ret;

	if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
		/* For nocow write, we can release the qgroup rsv right now */
166
		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
167 168 169 170 171 172 173 174
		if (ret < 0)
			return ret;
		ret = 0;
	} else {
		/*
		 * The ordered extent has reserved qgroup space, release now
		 * and pass the reserved number for qgroup_record to free.
		 */
175
		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
176 177 178
		if (ret < 0)
			return ret;
	}
179
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
C
Chris Mason 已提交
180 181 182
	if (!entry)
		return -ENOMEM;

183
	entry->file_offset = file_offset;
184 185 186 187
	entry->disk_bytenr = disk_bytenr;
	entry->num_bytes = num_bytes;
	entry->disk_num_bytes = disk_num_bytes;
	entry->bytes_left = num_bytes;
188
	entry->inode = igrab(&inode->vfs_inode);
189
	entry->compress_type = compress_type;
190
	entry->truncated_len = (u64)-1;
191
	entry->qgroup_rsv = ret;
192 193 194
	entry->physical = (u64)-1;
	entry->disk = NULL;
	entry->partno = (u8)-1;
195 196 197 198 199 200

	ASSERT(type == BTRFS_ORDERED_REGULAR ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_PREALLOC ||
	       type == BTRFS_ORDERED_COMPRESSED);
	set_bit(type, &entry->flags);
201

202 203 204 205
	percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
				 fs_info->delalloc_batch);

	if (dio)
206 207
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

208
	/* one ref for the tree */
209
	refcount_set(&entry->refs, 1);
210 211
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
212
	INIT_LIST_HEAD(&entry->log_list);
213
	INIT_LIST_HEAD(&entry->root_extent_list);
214 215
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
C
Chris Mason 已提交
216

217
	trace_btrfs_ordered_extent_add(inode, entry);
218

219
	spin_lock_irq(&tree->lock);
220 221
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
222
	if (node)
223 224 225
		btrfs_panic(fs_info, -EEXIST,
				"inconsistency in ordered tree at offset %llu",
				file_offset);
226
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
227

228
	spin_lock(&root->ordered_extent_lock);
229
	list_add_tail(&entry->root_extent_list,
230 231 232
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
233
		spin_lock(&fs_info->ordered_root_lock);
234
		BUG_ON(!list_empty(&root->ordered_root));
235 236
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
237 238
	}
	spin_unlock(&root->ordered_extent_lock);
239

J
Josef Bacik 已提交
240 241 242 243 244
	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
245 246 247
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);
	spin_unlock(&inode->lock);
J
Josef Bacik 已提交
248

C
Chris Mason 已提交
249 250 251
	return 0;
}

252
/*
 * Add an uncompressed ordered extent that is not flagged as direct IO.
 *
 * @type must be BTRFS_ORDERED_REGULAR, BTRFS_ORDERED_NOCOW or
 * BTRFS_ORDERED_PREALLOC.  Returns whatever __btrfs_add_ordered_extent()
 * returns.
 */
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
			     u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
			     int type)
{
	const int dio = 0;

	ASSERT(type == BTRFS_ORDERED_REGULAR ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_PREALLOC);

	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type,
					  dio, BTRFS_COMPRESS_NONE);
}

264
/*
 * Add an uncompressed ordered extent flagged as direct IO
 * (BTRFS_ORDERED_DIRECT is set on the new entry).
 *
 * @type must be BTRFS_ORDERED_REGULAR, BTRFS_ORDERED_NOCOW or
 * BTRFS_ORDERED_PREALLOC.  Returns whatever __btrfs_add_ordered_extent()
 * returns.
 */
int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
				 u64 disk_bytenr, u64 num_bytes,
				 u64 disk_num_bytes, int type)
{
	const int dio = 1;

	ASSERT(type == BTRFS_ORDERED_REGULAR ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_PREALLOC);

	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type,
					  dio, BTRFS_COMPRESS_NONE);
}

276
/*
 * Add an ordered extent of type BTRFS_ORDERED_COMPRESSED that is not
 * flagged as direct IO.
 *
 * @compress_type must not be BTRFS_COMPRESS_NONE.  Returns whatever
 * __btrfs_add_ordered_extent() returns.
 */
int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
				      u64 disk_bytenr, u64 num_bytes,
				      u64 disk_num_bytes, int compress_type)
{
	const int dio = 0;

	ASSERT(compress_type != BTRFS_COMPRESS_NONE);

	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes,
					  BTRFS_ORDERED_COMPRESSED, dio,
					  compress_type);
}

287 288
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
289 290
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
291
 */
292
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
293
			   struct btrfs_ordered_sum *sum)
C
Chris Mason 已提交
294
{
295
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
296

297
	tree = &BTRFS_I(entry->inode)->ordered_tree;
298
	spin_lock_irq(&tree->lock);
299
	list_add_tail(&sum->list, &entry->list);
300
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
301 302
}

303
/*
304
 * Mark all ordered extents io inside the specified range finished.
305
 *
306 307 308 309 310 311 312 313
 * @page:	 The invovled page for the opeartion.
 *		 For uncompressed buffered IO, the page status also needs to be
 *		 updated to indicate whether the pending ordered io is finished.
 *		 Can be NULL for direct IO and compressed write.
 *		 For these cases, callers are ensured they won't execute the
 *		 endio function twice.
 * @finish_func: The function to be executed when all the IO of an ordered
 *		 extent are finished.
314
 *
315 316
 * This function is called for endio, thus the range must have ordered
 * extent(s) coveri it.
317
 */
318 319 320 321
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
				struct page *page, u64 file_offset,
				u64 num_bytes, btrfs_func_t finish_func,
				bool uptodate)
322
{
323
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
324 325
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_workqueue *wq;
326 327
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
328
	unsigned long flags;
329 330 331 332 333 334 335 336 337 338
	u64 cur = file_offset;

	if (btrfs_is_free_space_inode(inode))
		wq = fs_info->endio_freespace_worker;
	else
		wq = fs_info->endio_write_workers;

	if (page)
		ASSERT(page->mapping && page_offset(page) <= file_offset &&
		       file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
339

340
	spin_lock_irqsave(&tree->lock, flags);
341 342 343 344 345 346 347 348 349
	while (cur < file_offset + num_bytes) {
		u64 entry_end;
		u64 end;
		u32 len;

		node = tree_search(tree, cur);
		/* No ordered extents at all */
		if (!node)
			break;
350

351 352
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		entry_end = entry->file_offset + entry->num_bytes;
353
		/*
354 355 356
		 * |<-- OE --->|  |
		 *		  cur
		 * Go to next OE.
357
		 */
358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
		if (cur >= entry_end) {
			node = rb_next(node);
			/* No more ordered extents, exit */
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_ordered_extent,
					 rb_node);

			/* Go to next ordered extent and continue */
			cur = entry->file_offset;
			continue;
		}
		/*
		 * |	|<--- OE --->|
		 * cur
		 * Go to the start of OE.
		 */
		if (cur < entry->file_offset) {
			cur = entry->file_offset;
			continue;
		}

		/*
		 * Now we are definitely inside one ordered extent.
		 *
		 * |<--- OE --->|
		 *	|
		 *	cur
		 */
		end = min(entry->file_offset + entry->num_bytes,
			  file_offset + num_bytes) - 1;
		ASSERT(end + 1 - cur < U32_MAX);
		len = end + 1 - cur;

		if (page) {
			/*
394 395
			 * Ordered (Private2) bit indicates whether we still
			 * have pending io unfinished for the ordered extent.
396 397 398
			 *
			 * If there's no such bit, we need to skip to next range.
			 */
399
			if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
400 401 402
				cur += len;
				continue;
			}
403
			btrfs_page_clear_ordered(fs_info, page, cur, len);
404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437
		}

		/* Now we're fine to update the accounting */
		if (unlikely(len > entry->bytes_left)) {
			WARN_ON(1);
			btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
				   inode->root->root_key.objectid,
				   btrfs_ino(inode),
				   entry->file_offset,
				   entry->num_bytes,
				   len, entry->bytes_left);
			entry->bytes_left = 0;
		} else {
			entry->bytes_left -= len;
		}

		if (!uptodate)
			set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

		/*
		 * All the IO of the ordered extent is finished, we need to queue
		 * the finish_func to be executed.
		 */
		if (entry->bytes_left == 0) {
			set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
			cond_wake_up(&entry->wait);
			refcount_inc(&entry->refs);
			spin_unlock_irqrestore(&tree->lock, flags);
			btrfs_init_work(&entry->work, finish_func, NULL, NULL);
			btrfs_queue_work(wq, &entry->work);
			spin_lock_irqsave(&tree->lock, flags);
		}
		cur += len;
438
	}
439
	spin_unlock_irqrestore(&tree->lock, flags);
440 441
}

442
/*
443 444 445 446 447 448 449 450 451
 * Finish IO for one ordered extent across a given range.  The range can only
 * contain one ordered extent.
 *
 * @cached:	 The cached ordered extent. If not NULL, we can skip the tree
 *               search and use the ordered extent directly.
 * 		 Will be also used to store the finished ordered extent.
 * @file_offset: File offset for the finished IO
 * @io_size:	 Length of the finish IO range
 * @uptodate:	 If the IO finishes without problem
452
 *
453 454 455 456 457 458
 * Return true if the ordered extent is finished in the range, and update
 * @cached.
 * Return false otherwise.
 *
 * NOTE: The range can NOT cross multiple ordered extents.
 * Thus caller should ensure the range doesn't cross ordered extents.
459
 */
460 461 462
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent **cached,
				    u64 file_offset, u64 io_size, int uptodate)
C
Chris Mason 已提交
463
{
464
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
C
Chris Mason 已提交
465
	struct rb_node *node;
466
	struct btrfs_ordered_extent *entry = NULL;
467
	unsigned long flags;
468
	bool finished = false;
469

470 471 472 473 474 475
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

476
	node = tree_search(tree, file_offset);
477
	if (!node)
478
		goto out;
C
Chris Mason 已提交
479

480
	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
481
have_entry:
482
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
483 484
		goto out;

485
	if (io_size > entry->bytes_left)
486
		btrfs_crit(inode->root->fs_info,
487
			   "bad ordered accounting left %llu size %llu",
488
		       entry->bytes_left, io_size);
489

490
	entry->bytes_left -= io_size;
491 492 493
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

494
	if (entry->bytes_left == 0) {
495 496 497 498 499
		/*
		 * Ensure only one caller can set the flag and finished_ret
		 * accordingly
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
500 501
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
502
	}
503
out:
504
	if (finished && cached && entry) {
505
		*cached = entry;
506
		refcount_inc(&entry->refs);
507
	}
508
	spin_unlock_irqrestore(&tree->lock, flags);
509
	return finished;
510
}
C
Chris Mason 已提交
511

512 513 514 515
/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
516
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
517
{
518 519 520
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

521
	trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
522

523
	if (refcount_dec_and_test(&entry->refs)) {
524
		ASSERT(list_empty(&entry->root_extent_list));
525
		ASSERT(list_empty(&entry->log_list));
526
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
527 528
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
C
Chris Mason 已提交
529
		while (!list_empty(&entry->list)) {
530 531 532
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
533
			kvfree(sum);
534
		}
535
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
536
	}
C
Chris Mason 已提交
537
}
538

539 540
/*
 * remove an ordered extent from the tree.  No references are dropped
541
 * and waiters are woken up.
542
 */
543
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
544
				 struct btrfs_ordered_extent *entry)
545
{
546
	struct btrfs_ordered_inode_tree *tree;
J
Josef Bacik 已提交
547
	struct btrfs_root *root = btrfs_inode->root;
548
	struct btrfs_fs_info *fs_info = root->fs_info;
549
	struct rb_node *node;
550
	bool pending;
551

J
Josef Bacik 已提交
552 553 554 555 556
	/* This is paired with btrfs_add_ordered_extent. */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root)
557 558
		btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
						false);
J
Josef Bacik 已提交
559

560 561
	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
				 fs_info->delalloc_batch);
J
Josef Bacik 已提交
562

J
Josef Bacik 已提交
563
	tree = &btrfs_inode->ordered_tree;
564
	spin_lock_irq(&tree->lock);
565
	node = &entry->rb_node;
566
	rb_erase(node, &tree->tree);
567
	RB_CLEAR_NODE(node);
568 569
	if (tree->last == node)
		tree->last = NULL;
570
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
571
	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
572
	spin_unlock_irq(&tree->lock);
573

574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600
	/*
	 * The current running transaction is waiting on us, we need to let it
	 * know that we're complete and wake it up.
	 */
	if (pending) {
		struct btrfs_transaction *trans;

		/*
		 * The checks for trans are just a formality, it should be set,
		 * but if it isn't we don't want to deref/assert under the spin
		 * lock, so be nice and check if trans is set, but ASSERT() so
		 * if it isn't set a developer will notice.
		 */
		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ASSERT(trans);
		if (trans) {
			if (atomic_dec_and_test(&trans->pending_ordered))
				wake_up(&trans->pending_wait);
			btrfs_put_transaction(trans);
		}
	}

601
	spin_lock(&root->ordered_extent_lock);
602
	list_del_init(&entry->root_extent_list);
603
	root->nr_ordered_extents--;
604

605
	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
606

607
	if (!root->nr_ordered_extents) {
608
		spin_lock(&fs_info->ordered_root_lock);
609 610
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
611
		spin_unlock(&fs_info->ordered_root_lock);
612 613
	}
	spin_unlock(&root->ordered_extent_lock);
614
	wake_up(&entry->wait);
615 616
}

617
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
618 619 620 621
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
622
	btrfs_start_ordered_extent(ordered, 1);
623 624 625
	complete(&ordered->completion);
}

C
Chris Mason 已提交
626 627 628 629
/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
630
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
631
			       const u64 range_start, const u64 range_len)
632
{
633
	struct btrfs_fs_info *fs_info = root->fs_info;
634 635 636
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
637
	struct btrfs_ordered_extent *ordered, *next;
638
	u64 count = 0;
639
	const u64 range_end = range_start + range_len;
640

641
	mutex_lock(&root->ordered_extent_mutex);
642 643
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
644
	while (!list_empty(&splice) && nr) {
645 646
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);
647

648 649
		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
650 651 652 653 654
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

655 656
		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
657
		refcount_inc(&ordered->refs);
658
		spin_unlock(&root->ordered_extent_lock);
659

660 661
		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL, NULL);
662
		list_add_tail(&ordered->work_list, &works);
663
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
664

665
		cond_resched();
666
		spin_lock(&root->ordered_extent_lock);
667
		if (nr != U64_MAX)
668 669
			nr--;
		count++;
670
	}
671
	list_splice_tail(&skipped, &root->ordered_extents);
672
	list_splice_tail(&splice, &root->ordered_extents);
673
	spin_unlock(&root->ordered_extent_lock);
674 675 676 677 678 679 680

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
681
	mutex_unlock(&root->ordered_extent_mutex);
682 683

	return count;
684 685
}

686
/*
 * Run btrfs_wait_ordered_extents() on the roots found on
 * fs_info->ordered_roots.
 *
 * @fs_info:     filesystem whose roots are walked
 * @nr:          budget of ordered extents to wait for across all roots;
 *               U64_MAX is never decremented
 * @range_start: passed through to btrfs_wait_ordered_extents()
 * @range_len:   passed through to btrfs_wait_ordered_extents()
 */
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			     const u64 range_start, const u64 range_len)
{
	struct list_head pending;
	struct btrfs_root *root;
	u64 waited;

	INIT_LIST_HEAD(&pending);

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &pending);

	while (!list_empty(&pending) && nr) {
		root = list_first_entry(&pending, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);

		/* Put the root back on the fs list before dropping the lock. */
		list_move_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		waited = btrfs_wait_ordered_extents(root, nr,
						    range_start, range_len);
		btrfs_put_root(root);

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX)
			nr -= waited;
	}

	/* Roots we never got to go back to the fs list as well. */
	list_splice_tail(&pending, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}

721 722 723 724 725 726 727
/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
728
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
729 730
{
	u64 start = entry->file_offset;
731
	u64 end = start + entry->num_bytes - 1;
732
	struct btrfs_inode *inode = BTRFS_I(entry->inode);
733

734
	trace_btrfs_ordered_extent_start(inode, entry);
735

736 737 738
	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
739
	 * for the flusher thread to find them
740
	 */
741
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
742
		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
C
Chris Mason 已提交
743
	if (wait) {
744 745
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
C
Chris Mason 已提交
746
	}
747
}
748

749 750 751
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
752
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
753
{
754
	int ret = 0;
755
	int ret_wb = 0;
756
	u64 end;
757
	u64 orig_end;
758
	struct btrfs_ordered_extent *ordered;
759 760

	if (start + len < start) {
761
		orig_end = INT_LIMIT(loff_t);
762 763
	} else {
		orig_end = start + len - 1;
764 765
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
766
	}
767

768 769 770
	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
771
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
772 773
	if (ret)
		return ret;
774

775 776 777 778 779 780 781 782
	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
783

784
	end = orig_end;
C
Chris Mason 已提交
785
	while (1) {
786
		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
C
Chris Mason 已提交
787
		if (!ordered)
788
			break;
789
		if (ordered->file_offset > orig_end) {
790 791 792
			btrfs_put_ordered_extent(ordered);
			break;
		}
793
		if (ordered->file_offset + ordered->num_bytes <= start) {
794 795 796
			btrfs_put_ordered_extent(ordered);
			break;
		}
797
		btrfs_start_ordered_extent(ordered, 1);
798
		end = ordered->file_offset;
799 800 801 802 803
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
804 805
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
806
		btrfs_put_ordered_extent(ordered);
807
		if (end == 0 || end == start)
808 809 810
			break;
		end--;
	}
811
	return ret_wb ? ret_wb : ret;
812 813
}

814 815 816 817
/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
818
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
819 820 821 822 823
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
824
	unsigned long flags;
825

826
	tree = &inode->ordered_tree;
827
	spin_lock_irqsave(&tree->lock, flags);
828 829 830 831 832
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
833
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
834 835
		entry = NULL;
	if (entry)
836
		refcount_inc(&entry->refs);
837
out:
838
	spin_unlock_irqrestore(&tree->lock, flags);
839 840 841
	return entry;
}

842 843 844
/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
845 846
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
847 848 849 850 851
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

852
	tree = &inode->ordered_tree;
853
	spin_lock_irq(&tree->lock);
854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
877
		refcount_inc(&entry->refs);
878
	spin_unlock_irq(&tree->lock);
879 880 881
	return entry;
}

882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909
/*
 * Add the inode's ordered extents that do not have BTRFS_ORDERED_LOGGED set
 * to the given list, taking a reference on each one added.  The tree is
 * walked from rb_first() onwards, so the list ends up sorted by the
 * file_offset of the ordered extents.
 *
 * The inode must be locked by the caller.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
					   struct list_head *list)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct btrfs_ordered_extent *ordered;
	struct rb_node *n;

	ASSERT(inode_is_locked(&inode->vfs_inode));

	spin_lock_irq(&tree->lock);
	n = rb_first(&tree->tree);
	while (n) {
		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);

		if (!test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) {
			ASSERT(list_empty(&ordered->log_list));
			list_add_tail(&ordered->log_list, list);
			refcount_inc(&ordered->refs);
		}

		n = rb_next(n);
	}
	spin_unlock_irq(&tree->lock);
}

910 911 912 913
/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
914
struct btrfs_ordered_extent *
915
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
916 917 918 919 920
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

921
	tree = &inode->ordered_tree;
922
	spin_lock_irq(&tree->lock);
923 924 925 926 927
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
928
	refcount_inc(&entry->refs);
929
out:
930
	spin_unlock_irq(&tree->lock);
931
	return entry;
932
}
933

934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008
/*
 * Lookup the first ordered extent that overlaps the range
 * [@file_offset, @file_offset + @len).
 *
 * The difference between this and btrfs_lookup_first_ordered_extent() is
 * that this one won't return any ordered extent that does not overlap the range.
 * And the difference against btrfs_lookup_ordered_extent() is, this function
 * ensures the first ordered extent gets returned.
 *
 * A reference is taken on the returned extent; NULL is returned if no
 * overlapping extent is found.
 */
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
			struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct btrfs_ordered_extent *entry = NULL;
	struct rb_node *walk;
	struct rb_node *anchor;
	struct rb_node *before;
	struct rb_node *after;

	spin_lock_irq(&tree->lock);

	/*
	 * Here we don't want to use tree_search() which will use tree->last
	 * and screw up the search order.
	 * And __tree_search() can't return the adjacent ordered extents
	 * either, thus here we do our own search.
	 */
	walk = tree->tree.rb_node;
	while (walk) {
		entry = rb_entry(walk, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset) {
			walk = walk->rb_left;
		} else if (file_offset >= entry_end(entry)) {
			walk = walk->rb_right;
		} else {
			/*
			 * Direct hit, got an ordered extent that contains
			 * @file_offset
			 */
			goto out;
		}
	}

	/* Empty tree */
	if (!entry)
		goto out;

	/*
	 * We got an entry around @file_offset, check adjacent entries: the
	 * last visited one sits on one side of @file_offset, its tree
	 * neighbour on the other.
	 */
	anchor = &entry->rb_node;
	if (entry->file_offset < file_offset) {
		before = anchor;
		after = rb_next(anchor);
	} else {
		before = rb_prev(anchor);
		after = anchor;
	}

	/* Prefer the lower of the two candidates. */
	if (before) {
		entry = rb_entry(before, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			goto out;
	}
	if (after) {
		entry = rb_entry(after, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			goto out;
	}

	/* No ordered extent in the range */
	entry = NULL;
out:
	if (entry)
		refcount_inc(&entry->refs);
	spin_unlock_irq(&tree->lock);
	return entry;
}

1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021
/*
 * btrfs_flush_ordered_range - Lock the passed range and ensures all pending
 * ordered extents in it are run to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 * locked range. It's the caller's responsibility to free the cached state.
 *
 * This function always returns with the given range locked, ensuring after it's
 * called no order extent can be pending.
 */
1022
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
1023 1024 1025 1026
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
1027 1028
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;
1029 1030

	if (cached_state)
1031
		cachedp = cached_state;
1032 1033

	while (1) {
1034
		lock_extent_bits(&inode->io_tree, start, end, cachedp);
1035 1036
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
1037 1038 1039 1040 1041 1042 1043
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
1044
				refcount_dec(&cache->refs);
1045
			break;
1046
		}
1047
		unlock_extent_cached(&inode->io_tree, start, end, cachedp);
1048
		btrfs_start_ordered_extent(ordered, 1);
1049 1050 1051 1052
		btrfs_put_ordered_extent(ordered);
	}
}

1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123
static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
				u64 len)
{
	struct inode *inode = ordered->inode;
	u64 file_offset = ordered->file_offset + pos;
	u64 disk_bytenr = ordered->disk_bytenr + pos;
	u64 num_bytes = len;
	u64 disk_num_bytes = len;
	int type;
	unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT);
	int compress_type = ordered->compress_type;
	unsigned long weight;
	int ret;

	weight = hweight_long(flags_masked);
	WARN_ON_ONCE(weight > 1);
	if (!weight)
		type = 0;
	else
		type = __ffs(flags_masked);

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
		WARN_ON_ONCE(1);
		ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode),
				file_offset, disk_bytenr, num_bytes,
				disk_num_bytes, compress_type);
	} else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
		ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset,
				disk_bytenr, num_bytes, disk_num_bytes, type);
	} else {
		ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset,
				disk_bytenr, num_bytes, disk_num_bytes, type);
	}

	return ret;
}

/*
 * Trim @ordered by @pre bytes at its start and @post bytes at its end, and
 * add new ordered extents for the trimmed-off head and tail.
 *
 * @ordered: ordered extent to shrink; it is removed from the inode's ordered
 *           tree, adjusted, and re-inserted at its new file offset
 * @pre:     number of bytes cut from the beginning (0 for none)
 * @post:    number of bytes cut from the end (0 for none)
 *
 * Returns 0 on success, or the error returned by clone_ordered_extent().
 * A collision while re-inserting the shrunk extent triggers btrfs_panic().
 */
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
				u64 post)
{
	struct inode *inode = ordered->inode;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	struct rb_node *node;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret = 0;

	spin_lock_irq(&tree->lock);
	/* Remove from tree once */
	node = &ordered->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	/* Do not leave the cached last-lookup pointer on the erased node. */
	if (tree->last == node)
		tree->last = NULL;

	ordered->file_offset += pre;
	ordered->disk_bytenr += pre;
	ordered->num_bytes -= (pre + post);
	ordered->disk_num_bytes -= (pre + post);
	ordered->bytes_left -= (pre + post);

	/* Re-insert the node */
	node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
			"zoned: inconsistency in ordered tree at offset %llu",
			    ordered->file_offset);

	spin_unlock_irq(&tree->lock);

	/*
	 * clone_ordered_extent() places the clone at the *current*
	 * ordered->file_offset / ordered->disk_bytenr plus its position
	 * argument, and both fields were already advanced by @pre above.
	 * So the positions must be relative to the new start:
	 *  - the head piece begins @pre bytes before it; u64 arithmetic is
	 *    modular, so adding (0 - pre) subtracts @pre,
	 *  - the tail piece begins right after the bytes that remain.
	 * For @pre == 0 this is identical to the previous behaviour.
	 */
	if (pre)
		ret = clone_ordered_extent(ordered, 0 - pre, pre);
	if (ret == 0 && post)
		ret = clone_ordered_extent(ordered, ordered->disk_num_bytes,
					   post);

	return ret;
}

1131 1132 1133 1134
int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
1135
				     SLAB_MEM_SPREAD,
1136 1137 1138
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;
1139

1140 1141 1142
	return 0;
}

1143
void __cold ordered_data_exit(void)
1144
{
1145
	kmem_cache_destroy(btrfs_ordered_extent_cache);
1146
}