ordered-data.c 28.2 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
C
Chris Mason 已提交
2 3 4 5 6
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/slab.h>
7
#include <linux/blkdev.h>
8
#include <linux/writeback.h>
9
#include <linux/sched/mm.h>
10
#include "misc.h"
C
Chris Mason 已提交
11 12 13
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
14
#include "extent_io.h"
15
#include "disk-io.h"
16
#include "compression.h"
17
#include "delalloc-space.h"
18
#include "qgroup.h"
C
Chris Mason 已提交
19

20 21
static struct kmem_cache *btrfs_ordered_extent_cache;

22
static u64 entry_end(struct btrfs_ordered_extent *entry)
C
Chris Mason 已提交
23
{
24
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
25
		return (u64)-1;
26
	return entry->file_offset + entry->num_bytes;
C
Chris Mason 已提交
27 28
}

C
Chris Mason 已提交
29 30 31
/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
32 33
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
C
Chris Mason 已提交
34
{
C
Chris Mason 已提交
35 36
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
37
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
38

C
Chris Mason 已提交
39
	while (*p) {
C
Chris Mason 已提交
40
		parent = *p;
41
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
42

43
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
44
			p = &(*p)->rb_left;
45
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
46 47 48 49 50 51 52 53 54 55
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

C
Chris Mason 已提交
56 57 58 59
/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
60 61
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
C
Chris Mason 已提交
62
{
C
Chris Mason 已提交
63
	struct rb_node *n = root->rb_node;
C
Chris Mason 已提交
64
	struct rb_node *prev = NULL;
65 66 67
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;
C
Chris Mason 已提交
68

C
Chris Mason 已提交
69
	while (n) {
70
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
71 72 73
		prev = n;
		prev_entry = entry;

74
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
75
			n = n->rb_left;
76
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
77 78 79 80 81 82 83
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

C
Chris Mason 已提交
84
	while (prev && file_offset >= entry_end(prev_entry)) {
85 86 87 88 89 90 91 92 93 94 95 96 97
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
C
Chris Mason 已提交
98
	while (prev && file_offset < entry_end(prev_entry)) {
99 100 101 102 103 104
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
C
Chris Mason 已提交
105 106 107 108 109
	}
	*prev_ret = prev;
	return NULL;
}

C
Chris Mason 已提交
110 111 112
/*
 * helper to check if a given offset is inside a given entry
 */
113 114 115
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
116
	    entry->file_offset + entry->num_bytes <= file_offset)
117 118 119 120
		return 0;
	return 1;
}

121 122 123 124
/*
 * Check whether the range [file_offset, file_offset + len) intersects @entry.
 *
 * Returns 1 on overlap, 0 otherwise.  The end of the entry comes from
 * entry_end() so that it matches the (wrap-clamped) end used by the tree
 * insert and search helpers.  The end of the queried range is computed as
 * file_offset + len without any wrap handling.
 */
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	/* Range ends at or before the entry starts. */
	if (file_offset + len <= entry->file_offset)
		return 0;
	/* Entry ends at or before the range starts. */
	if (entry_end(entry) <= file_offset)
		return 0;
	return 1;
}

C
Chris Mason 已提交
130 131 132 133
/*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
134 135
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
C
Chris Mason 已提交
136
{
137
	struct rb_root *root = &tree->tree;
138
	struct rb_node *prev = NULL;
C
Chris Mason 已提交
139
	struct rb_node *ret;
140 141 142 143 144 145 146 147 148
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
C
Chris Mason 已提交
149
	if (!ret)
150 151 152
		ret = prev;
	if (ret)
		tree->last = ret;
C
Chris Mason 已提交
153 154 155
	return ret;
}

156 157
/*
 * Allocate and add a new ordered_extent into the per-inode tree.
158 159 160 161
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
162
static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
163 164 165
				      u64 disk_bytenr, u64 num_bytes,
				      u64 disk_num_bytes, int type, int dio,
				      int compress_type)
C
Chris Mason 已提交
166
{
167 168 169
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
170 171
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
172 173 174 175
	int ret;

	if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
		/* For nocow write, we can release the qgroup rsv right now */
176
		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
177 178 179 180 181 182 183 184
		if (ret < 0)
			return ret;
		ret = 0;
	} else {
		/*
		 * The ordered extent has reserved qgroup space, release now
		 * and pass the reserved number for qgroup_record to free.
		 */
185
		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
186 187 188
		if (ret < 0)
			return ret;
	}
189
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
C
Chris Mason 已提交
190 191 192
	if (!entry)
		return -ENOMEM;

193
	entry->file_offset = file_offset;
194 195 196 197
	entry->disk_bytenr = disk_bytenr;
	entry->num_bytes = num_bytes;
	entry->disk_num_bytes = disk_num_bytes;
	entry->bytes_left = num_bytes;
198
	entry->inode = igrab(&inode->vfs_inode);
199
	entry->compress_type = compress_type;
200
	entry->truncated_len = (u64)-1;
201
	entry->qgroup_rsv = ret;
202 203 204 205 206 207

	ASSERT(type == BTRFS_ORDERED_REGULAR ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_PREALLOC ||
	       type == BTRFS_ORDERED_COMPRESSED);
	set_bit(type, &entry->flags);
208

209 210 211 212
	percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
				 fs_info->delalloc_batch);

	if (dio)
213 214
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

215
	/* one ref for the tree */
216
	refcount_set(&entry->refs, 1);
217 218
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
219
	INIT_LIST_HEAD(&entry->log_list);
220
	INIT_LIST_HEAD(&entry->root_extent_list);
221 222
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
C
Chris Mason 已提交
223

224
	trace_btrfs_ordered_extent_add(inode, entry);
225

226
	spin_lock_irq(&tree->lock);
227 228
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
229
	if (node)
230 231 232
		btrfs_panic(fs_info, -EEXIST,
				"inconsistency in ordered tree at offset %llu",
				file_offset);
233
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
234

235
	spin_lock(&root->ordered_extent_lock);
236
	list_add_tail(&entry->root_extent_list,
237 238 239
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
240
		spin_lock(&fs_info->ordered_root_lock);
241
		BUG_ON(!list_empty(&root->ordered_root));
242 243
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
244 245
	}
	spin_unlock(&root->ordered_extent_lock);
246

J
Josef Bacik 已提交
247 248 249 250 251
	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
252 253 254
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);
	spin_unlock(&inode->lock);
J
Josef Bacik 已提交
255

C
Chris Mason 已提交
256 257 258
	return 0;
}

259
/*
 * Add a non-direct, uncompressed ordered extent of the given @type
 * (REGULAR, NOCOW or PREALLOC) to @inode.  Returns the result of
 * __btrfs_add_ordered_extent().
 */
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
			     u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
			     int type)
{
	const int dio = 0;

	ASSERT(type == BTRFS_ORDERED_REGULAR ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_PREALLOC);

	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type, dio,
					  BTRFS_COMPRESS_NONE);
}

271
/*
 * Same as btrfs_add_ordered_extent(), but the new ordered extent is also
 * flagged as direct IO.  Returns the result of __btrfs_add_ordered_extent().
 */
int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
				 u64 disk_bytenr, u64 num_bytes,
				 u64 disk_num_bytes, int type)
{
	const int dio = 1;

	ASSERT(type == BTRFS_ORDERED_REGULAR ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_PREALLOC);

	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type, dio,
					  BTRFS_COMPRESS_NONE);
}

283
/*
 * Add a non-direct ordered extent of type BTRFS_ORDERED_COMPRESSED that
 * records @compress_type, which must not be BTRFS_COMPRESS_NONE.  Returns the
 * result of __btrfs_add_ordered_extent().
 */
int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
				      u64 disk_bytenr, u64 num_bytes,
				      u64 disk_num_bytes, int compress_type)
{
	const int dio = 0;

	ASSERT(compress_type != BTRFS_COMPRESS_NONE);

	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes,
					  BTRFS_ORDERED_COMPRESSED, dio,
					  compress_type);
}

294 295
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
296 297
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
298
 */
299
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
300
			   struct btrfs_ordered_sum *sum)
C
Chris Mason 已提交
301
{
302
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
303

304
	tree = &BTRFS_I(entry->inode)->ordered_tree;
305
	spin_lock_irq(&tree->lock);
306
	list_add_tail(&sum->list, &entry->list);
307
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
308 309
}

310
/*
311 312
 * Finish IO for one ordered extent across a given range.  The range can
 * contain several ordered extents.
313
 *
314 315 316 317 318 319
 * @found_ret:	 Return the finished ordered extent
 * @file_offset: File offset for the finished IO
 * 		 Will also be updated to one byte past the range that is
 * 		 recordered as finished. This allows caller to walk forward.
 * @io_size:	 Length of the finish IO range
 * @uptodate:	 If the IO finished without problem
320
 *
321 322 323 324 325 326 327
 * Return true if any ordered extent is finished in the range, and update
 * @found_ret and @file_offset.
 * Return false otherwise.
 *
 * NOTE: Although The range can cross multiple ordered extents, only one
 * ordered extent will be updated during one call. The caller is responsible to
 * iterate all ordered extents in the range.
328
 */
329 330
bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
				   struct btrfs_ordered_extent **finished_ret,
331
				   u64 *file_offset, u64 io_size, int uptodate)
332
{
333 334
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
335 336
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
337
	bool finished = false;
338
	unsigned long flags;
339 340 341 342
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

343
	spin_lock_irqsave(&tree->lock, flags);
344
	node = tree_search(tree, *file_offset);
345
	if (!node)
346 347 348
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
349
	if (!offset_in_entry(entry, *file_offset))
350 351 352
		goto out;

	dec_start = max(*file_offset, entry->file_offset);
353 354
	dec_end = min(*file_offset + io_size,
		      entry->file_offset + entry->num_bytes);
355 356
	*file_offset = dec_end;
	if (dec_start > dec_end) {
357 358
		btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
			   dec_start, dec_end);
359 360 361
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
362 363 364
		btrfs_crit(fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, to_dec);
365 366
	}
	entry->bytes_left -= to_dec;
367 368 369
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

370
	if (entry->bytes_left == 0) {
371 372 373 374 375
		/*
		 * Ensure only one caller can set the flag and finished_ret
		 * accordingly
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
376 377
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
378
	}
379
out:
380 381
	if (finished && finished_ret && entry) {
		*finished_ret = entry;
382
		refcount_inc(&entry->refs);
383
	}
384
	spin_unlock_irqrestore(&tree->lock, flags);
385
	return finished;
386 387
}

388
/*
389 390 391 392 393 394 395 396 397
 * Finish IO for one ordered extent across a given range.  The range can only
 * contain one ordered extent.
 *
 * @cached:	 The cached ordered extent. If not NULL, we can skip the tree
 *               search and use the ordered extent directly.
 * 		 Will be also used to store the finished ordered extent.
 * @file_offset: File offset for the finished IO
 * @io_size:	 Length of the finish IO range
 * @uptodate:	 If the IO finishes without problem
398
 *
399 400 401 402 403 404
 * Return true if the ordered extent is finished in the range, and update
 * @cached.
 * Return false otherwise.
 *
 * NOTE: The range can NOT cross multiple ordered extents.
 * Thus caller should ensure the range doesn't cross ordered extents.
405
 */
406 407 408
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent **cached,
				    u64 file_offset, u64 io_size, int uptodate)
C
Chris Mason 已提交
409
{
410
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
C
Chris Mason 已提交
411
	struct rb_node *node;
412
	struct btrfs_ordered_extent *entry = NULL;
413
	unsigned long flags;
414
	bool finished = false;
415

416 417 418 419 420 421
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

422
	node = tree_search(tree, file_offset);
423
	if (!node)
424
		goto out;
C
Chris Mason 已提交
425

426
	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
427
have_entry:
428
	if (!offset_in_entry(entry, file_offset))
429 430
		goto out;

431
	if (io_size > entry->bytes_left)
432
		btrfs_crit(inode->root->fs_info,
433
			   "bad ordered accounting left %llu size %llu",
434
		       entry->bytes_left, io_size);
435

436
	entry->bytes_left -= io_size;
437 438 439
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

440
	if (entry->bytes_left == 0) {
441 442 443 444 445
		/*
		 * Ensure only one caller can set the flag and finished_ret
		 * accordingly
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
446 447
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
448
	}
449
out:
450
	if (finished && cached && entry) {
451
		*cached = entry;
452
		refcount_inc(&entry->refs);
453
	}
454
	spin_unlock_irqrestore(&tree->lock, flags);
455
	return finished;
456
}
C
Chris Mason 已提交
457

458 459 460 461
/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
462
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
463
{
464 465 466
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

467
	trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
468

469
	if (refcount_dec_and_test(&entry->refs)) {
470
		ASSERT(list_empty(&entry->root_extent_list));
471
		ASSERT(list_empty(&entry->log_list));
472
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
473 474
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
C
Chris Mason 已提交
475
		while (!list_empty(&entry->list)) {
476 477 478
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
479
			kvfree(sum);
480
		}
481
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
482
	}
C
Chris Mason 已提交
483
}
484

485 486
/*
 * remove an ordered extent from the tree.  No references are dropped
487
 * and waiters are woken up.
488
 */
489
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
490
				 struct btrfs_ordered_extent *entry)
491
{
492
	struct btrfs_ordered_inode_tree *tree;
J
Josef Bacik 已提交
493
	struct btrfs_root *root = btrfs_inode->root;
494
	struct btrfs_fs_info *fs_info = root->fs_info;
495
	struct rb_node *node;
496
	bool pending;
497

J
Josef Bacik 已提交
498 499 500 501 502
	/* This is paired with btrfs_add_ordered_extent. */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root)
503 504
		btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
						false);
J
Josef Bacik 已提交
505

506 507
	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
				 fs_info->delalloc_batch);
J
Josef Bacik 已提交
508

J
Josef Bacik 已提交
509
	tree = &btrfs_inode->ordered_tree;
510
	spin_lock_irq(&tree->lock);
511
	node = &entry->rb_node;
512
	rb_erase(node, &tree->tree);
513
	RB_CLEAR_NODE(node);
514 515
	if (tree->last == node)
		tree->last = NULL;
516
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
517
	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
518
	spin_unlock_irq(&tree->lock);
519

520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546
	/*
	 * The current running transaction is waiting on us, we need to let it
	 * know that we're complete and wake it up.
	 */
	if (pending) {
		struct btrfs_transaction *trans;

		/*
		 * The checks for trans are just a formality, it should be set,
		 * but if it isn't we don't want to deref/assert under the spin
		 * lock, so be nice and check if trans is set, but ASSERT() so
		 * if it isn't set a developer will notice.
		 */
		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ASSERT(trans);
		if (trans) {
			if (atomic_dec_and_test(&trans->pending_ordered))
				wake_up(&trans->pending_wait);
			btrfs_put_transaction(trans);
		}
	}

547
	spin_lock(&root->ordered_extent_lock);
548
	list_del_init(&entry->root_extent_list);
549
	root->nr_ordered_extents--;
550

551
	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
552

553
	if (!root->nr_ordered_extents) {
554
		spin_lock(&fs_info->ordered_root_lock);
555 556
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
557
		spin_unlock(&fs_info->ordered_root_lock);
558 559
	}
	spin_unlock(&root->ordered_extent_lock);
560
	wake_up(&entry->wait);
561 562
}

563
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
564 565 566 567
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
568
	btrfs_start_ordered_extent(ordered, 1);
569 570 571
	complete(&ordered->completion);
}

C
Chris Mason 已提交
572 573 574 575
/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
576
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
577
			       const u64 range_start, const u64 range_len)
578
{
579
	struct btrfs_fs_info *fs_info = root->fs_info;
580 581 582
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
583
	struct btrfs_ordered_extent *ordered, *next;
584
	u64 count = 0;
585
	const u64 range_end = range_start + range_len;
586

587
	mutex_lock(&root->ordered_extent_mutex);
588 589
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
590
	while (!list_empty(&splice) && nr) {
591 592
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);
593

594 595
		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
596 597 598 599 600
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

601 602
		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
603
		refcount_inc(&ordered->refs);
604
		spin_unlock(&root->ordered_extent_lock);
605

606 607
		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL, NULL);
608
		list_add_tail(&ordered->work_list, &works);
609
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
610

611
		cond_resched();
612
		spin_lock(&root->ordered_extent_lock);
613
		if (nr != U64_MAX)
614 615
			nr--;
		count++;
616
	}
617
	list_splice_tail(&skipped, &root->ordered_extents);
618
	list_splice_tail(&splice, &root->ordered_extents);
619
	spin_unlock(&root->ordered_extent_lock);
620 621 622 623 624 625 626

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
627
	mutex_unlock(&root->ordered_extent_mutex);
628 629

	return count;
630 631
}

632
/*
 * Run btrfs_wait_ordered_extents() on every root currently on
 * fs_info->ordered_roots, until the list is exhausted or @nr ordered extents
 * have been waited for.  An @nr of U64_MAX is never decremented.
 * @range_start and @range_len are passed through unchanged.
 */
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			     const u64 range_start, const u64 range_len)
{
	LIST_HEAD(splice);

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice) && nr) {
		struct btrfs_root *root;
		u64 done;

		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		/* The wait below is done without the spinlock held. */
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr,
						  range_start, range_len);
		btrfs_put_root(root);

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX)
			nr -= done;
	}
	/* Put back the roots we did not get to. */
	list_splice_tail(&splice, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}

667 668 669 670 671 672 673
/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
674
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
675 676
{
	u64 start = entry->file_offset;
677
	u64 end = start + entry->num_bytes - 1;
678
	struct btrfs_inode *inode = BTRFS_I(entry->inode);
679

680
	trace_btrfs_ordered_extent_start(inode, entry);
681

682 683 684
	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
685
	 * for the flusher thread to find them
686
	 */
687
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
688
		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
C
Chris Mason 已提交
689
	if (wait) {
690 691
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
C
Chris Mason 已提交
692
	}
693
}
694

695 696 697
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
698
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
699
{
700
	int ret = 0;
701
	int ret_wb = 0;
702
	u64 end;
703
	u64 orig_end;
704
	struct btrfs_ordered_extent *ordered;
705 706

	if (start + len < start) {
707
		orig_end = INT_LIMIT(loff_t);
708 709
	} else {
		orig_end = start + len - 1;
710 711
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
712
	}
713

714 715 716
	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
717
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
718 719
	if (ret)
		return ret;
720

721 722 723 724 725 726 727 728
	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
729

730
	end = orig_end;
C
Chris Mason 已提交
731
	while (1) {
732
		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
C
Chris Mason 已提交
733
		if (!ordered)
734
			break;
735
		if (ordered->file_offset > orig_end) {
736 737 738
			btrfs_put_ordered_extent(ordered);
			break;
		}
739
		if (ordered->file_offset + ordered->num_bytes <= start) {
740 741 742
			btrfs_put_ordered_extent(ordered);
			break;
		}
743
		btrfs_start_ordered_extent(ordered, 1);
744
		end = ordered->file_offset;
745 746 747 748 749
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
750 751
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
752
		btrfs_put_ordered_extent(ordered);
753
		if (end == 0 || end == start)
754 755 756
			break;
		end--;
	}
757
	return ret_wb ? ret_wb : ret;
758 759
}

760 761 762 763
/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
764
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
765 766 767 768 769 770
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

771
	tree = &inode->ordered_tree;
772
	spin_lock_irq(&tree->lock);
773 774 775 776 777 778 779 780
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
781
		refcount_inc(&entry->refs);
782
out:
783
	spin_unlock_irq(&tree->lock);
784 785 786
	return entry;
}

787 788 789
/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
790 791
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
792 793 794 795 796
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

797
	tree = &inode->ordered_tree;
798
	spin_lock_irq(&tree->lock);
799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
822
		refcount_inc(&entry->refs);
823
	spin_unlock_irq(&tree->lock);
824 825 826
	return entry;
}

827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854
/*
 * Adds all ordered extents to the given list. The list ends up sorted by the
 * file_offset of the ordered extents.
 *
 * Extents that already have BTRFS_ORDERED_LOGGED set are skipped.  A reference
 * is taken on every extent that is added.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
					   struct list_head *list)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *node;

	ASSERT(inode_is_locked(&inode->vfs_inode));

	spin_lock_irq(&tree->lock);
	node = rb_first(&tree->tree);
	while (node) {
		struct btrfs_ordered_extent *ordered =
			rb_entry(node, struct btrfs_ordered_extent, rb_node);

		if (!test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) {
			ASSERT(list_empty(&ordered->log_list));
			list_add_tail(&ordered->log_list, list);
			refcount_inc(&ordered->refs);
		}
		node = rb_next(node);
	}
	spin_unlock_irq(&tree->lock);
}

855 856 857 858
/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
859
struct btrfs_ordered_extent *
860
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
861 862 863 864 865
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

866
	tree = &inode->ordered_tree;
867
	spin_lock_irq(&tree->lock);
868 869 870 871 872
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
873
	refcount_inc(&entry->refs);
874
out:
875
	spin_unlock_irq(&tree->lock);
876
	return entry;
877
}
878

879 880 881 882 883 884 885 886 887 888 889 890 891
/*
 * btrfs_flush_ordered_range - Lock the passed range and ensures all pending
 * ordered extents in it are run to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 * locked range. It's the caller's responsibility to free the cached state.
 *
 * This function always returns with the given range locked, ensuring after it's
 * called no order extent can be pending.
 */
892
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
893 894 895 896
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
897 898
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;
899 900

	if (cached_state)
901
		cachedp = cached_state;
902 903

	while (1) {
904
		lock_extent_bits(&inode->io_tree, start, end, cachedp);
905 906
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
907 908 909 910 911 912 913
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
914
				refcount_dec(&cache->refs);
915
			break;
916
		}
917
		unlock_extent_cached(&inode->io_tree, start, end, cachedp);
918
		btrfs_start_ordered_extent(ordered, 1);
919 920 921 922
		btrfs_put_ordered_extent(ordered);
	}
}

923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000
/*
 * Add a new ordered extent of @len bytes that starts @pos bytes into
 * @ordered, for both the file offset and the disk bytenr.
 *
 * The type of the new extent is the lowest flag bit set in @ordered once
 * BTRFS_ORDERED_DIRECT is masked out; more than one remaining bit triggers a
 * one-time warning.  Returns the result of the btrfs_add_ordered_extent*()
 * variant that was used.
 */
static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
				u64 len)
{
	struct btrfs_inode *binode = BTRFS_I(ordered->inode);
	const u64 new_file_offset = ordered->file_offset + pos;
	const u64 new_disk_bytenr = ordered->disk_bytenr + pos;
	const unsigned long type_bits =
		ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT);
	int type = 0;

	WARN_ON_ONCE(hweight_long(type_bits) > 1);
	if (type_bits)
		type = __ffs(type_bits);

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
		/* The compressed case always warns (once). */
		WARN_ON_ONCE(1);
		return btrfs_add_ordered_extent_compress(binode,
				new_file_offset, new_disk_bytenr, len, len,
				ordered->compress_type);
	}

	if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
		return btrfs_add_ordered_extent_dio(binode, new_file_offset,
				new_disk_bytenr, len, len, type);

	return btrfs_add_ordered_extent(binode, new_file_offset,
			new_disk_bytenr, len, len, type);
}

/*
 * Shrink @ordered by @pre bytes at its front and @post bytes at its end, and
 * add new ordered extents for the parts that were cut off.
 *
 * @ordered is taken out of the inode's ordered tree, its file offset, disk
 * bytenr and byte counters are adjusted, and it is re-inserted under the tree
 * lock.  The clones are added after the lock has been dropped.
 *
 * Returns 0 on success or the error from the first clone that failed; if the
 * @pre clone fails the @post clone is not attempted, so the error is not
 * overwritten.
 */
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
				u64 post)
{
	struct inode *inode = ordered->inode;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	struct rb_node *node;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret = 0;

	spin_lock_irq(&tree->lock);
	/* Remove from tree once */
	node = &ordered->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	if (tree->last == node)
		tree->last = NULL;

	ordered->file_offset += pre;
	ordered->disk_bytenr += pre;
	ordered->num_bytes -= (pre + post);
	ordered->disk_num_bytes -= (pre + post);
	ordered->bytes_left -= (pre + post);

	/* Re-insert the node */
	node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
			"zoned: inconsistency in ordered tree at offset %llu",
			    ordered->file_offset);

	spin_unlock_irq(&tree->lock);

	/*
	 * NOTE(review): the offsets passed to clone_ordered_extent() are
	 * relative to the already advanced ordered->file_offset, so for a
	 * non-zero @pre the leading clone starts at the new start of @ordered
	 * rather than at its old one - confirm against the callers that this
	 * is intended.
	 */
	if (pre)
		ret = clone_ordered_extent(ordered, 0, pre);
	/* Do not let the second clone hide a failure of the first one. */
	if (ret == 0 && post)
		ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
					   post);

	return ret;
}

1001 1002 1003 1004
/*
 * Create the slab cache used for struct btrfs_ordered_extent.
 * Returns 0 on success or -ENOMEM if the cache cannot be created.
 */
int __init ordered_data_init(void)
{
	struct kmem_cache *cache;

	cache = kmem_cache_create("btrfs_ordered_extent",
				  sizeof(struct btrfs_ordered_extent), 0,
				  SLAB_MEM_SPREAD,
				  NULL);
	btrfs_ordered_extent_cache = cache;

	return cache ? 0 : -ENOMEM;
}

1013
/* Destroy the slab cache created by ordered_data_init(). */
void __cold ordered_data_exit(void)
{
	struct kmem_cache *cache = btrfs_ordered_extent_cache;

	kmem_cache_destroy(cache);
}