/* fs/btrfs/ordered-data.c — btrfs ordered extent tracking */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
C
Chris Mason 已提交
28

29 30
static struct kmem_cache *btrfs_ordered_extent_cache;

31
static u64 entry_end(struct btrfs_ordered_extent *entry)
C
Chris Mason 已提交
32
{
33 34 35
	if (entry->file_offset + entry->len < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->len;
C
Chris Mason 已提交
36 37
}

C
Chris Mason 已提交
38 39 40
/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
41 42
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
C
Chris Mason 已提交
43
{
C
Chris Mason 已提交
44 45
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
46
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
47

C
Chris Mason 已提交
48
	while (*p) {
C
Chris Mason 已提交
49
		parent = *p;
50
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
51

52
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
53
			p = &(*p)->rb_left;
54
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
55 56 57 58 59 60 61 62 63 64
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

65 66 67 68 69
static void ordered_data_tree_panic(struct inode *inode, int errno,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
70
		    "%llu", offset);
71 72
}

C
Chris Mason 已提交
73 74 75 76
/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
77 78
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
C
Chris Mason 已提交
79
{
C
Chris Mason 已提交
80
	struct rb_node *n = root->rb_node;
C
Chris Mason 已提交
81
	struct rb_node *prev = NULL;
82 83 84
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;
C
Chris Mason 已提交
85

C
Chris Mason 已提交
86
	while (n) {
87
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
88 89 90
		prev = n;
		prev_entry = entry;

91
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
92
			n = n->rb_left;
93
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
94 95 96 97 98 99 100
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

C
Chris Mason 已提交
101
	while (prev && file_offset >= entry_end(prev_entry)) {
102 103 104 105 106 107 108 109 110 111 112 113 114
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
C
Chris Mason 已提交
115
	while (prev && file_offset < entry_end(prev_entry)) {
116 117 118 119 120 121
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
C
Chris Mason 已提交
122 123 124 125 126
	}
	*prev_ret = prev;
	return NULL;
}

C
Chris Mason 已提交
127 128 129
/*
 * helper to check if a given offset is inside a given entry
 */
130 131 132 133 134 135 136 137
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

138 139 140 141 142 143 144 145 146
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

C
Chris Mason 已提交
147 148 149 150
/*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
151 152
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
C
Chris Mason 已提交
153
{
154
	struct rb_root *root = &tree->tree;
155
	struct rb_node *prev = NULL;
C
Chris Mason 已提交
156
	struct rb_node *ret;
157 158 159 160 161 162 163 164 165
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
C
Chris Mason 已提交
166
	if (!ret)
167 168 169
		ret = prev;
	if (ret)
		tree->last = ret;
C
Chris Mason 已提交
170 171 172
	return ret;
}

173 174 175 176 177 178 179 180 181 182 183
/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
184 185
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
186
				      int type, int dio, int compress_type)
C
Chris Mason 已提交
187
{
188
	struct btrfs_root *root = BTRFS_I(inode)->root;
C
Chris Mason 已提交
189
	struct btrfs_ordered_inode_tree *tree;
190 191
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
192

193
	tree = &BTRFS_I(inode)->ordered_tree;
194
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
C
Chris Mason 已提交
195 196 197
	if (!entry)
		return -ENOMEM;

198 199 200
	entry->file_offset = file_offset;
	entry->start = start;
	entry->len = len;
201 202 203
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
	    !(type == BTRFS_ORDERED_NOCOW))
		entry->csum_bytes_left = disk_len;
C
Chris Mason 已提交
204
	entry->disk_len = disk_len;
205
	entry->bytes_left = len;
206
	entry->inode = igrab(inode);
207
	entry->compress_type = compress_type;
208
	entry->truncated_len = (u64)-1;
Y
Yan Zheng 已提交
209
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
Y
Yan Zheng 已提交
210
		set_bit(type, &entry->flags);
211

212 213 214
	if (dio)
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

215 216 217 218
	/* one ref for the tree */
	atomic_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
219
	INIT_LIST_HEAD(&entry->root_extent_list);
220 221
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
222
	INIT_LIST_HEAD(&entry->log_list);
C
Chris Mason 已提交
223

224 225
	trace_btrfs_ordered_extent_add(inode, entry);

226
	spin_lock_irq(&tree->lock);
227 228
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
229 230
	if (node)
		ordered_data_tree_panic(inode, -EEXIST, file_offset);
231
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
232

233
	spin_lock(&root->ordered_extent_lock);
234
	list_add_tail(&entry->root_extent_list,
235 236 237 238 239 240 241 242 243 244
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&root->fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root,
			      &root->fs_info->ordered_roots);
		spin_unlock(&root->fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
245

C
Chris Mason 已提交
246 247 248
	return 0;
}

249 250 251 252
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
253 254
					  disk_len, type, 0,
					  BTRFS_COMPRESS_NONE);
255 256 257 258 259 260
}

int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
261 262 263 264 265 266 267 268 269 270 271
					  disk_len, type, 1,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  compress_type);
272 273
}

274 275
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
276 277
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
278
 */
279 280 281
void btrfs_add_ordered_sum(struct inode *inode,
			   struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
C
Chris Mason 已提交
282
{
283
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
284

285
	tree = &BTRFS_I(inode)->ordered_tree;
286
	spin_lock_irq(&tree->lock);
287
	list_add_tail(&sum->list, &entry->list);
288 289 290 291
	WARN_ON(entry->csum_bytes_left < sum->len);
	entry->csum_bytes_left -= sum->len;
	if (entry->csum_bytes_left == 0)
		wake_up(&entry->wait);
292
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
293 294
}

295 296 297 298 299 300 301 302 303 304 305 306 307 308
/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO may span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete.  This allows you to walk forward in the file.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
309
				   u64 *file_offset, u64 io_size, int uptodate)
310 311 312 313 314
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
315
	unsigned long flags;
316 317 318 319 320
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
321
	spin_lock_irqsave(&tree->lock, flags);
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size, entry->file_offset +
		      entry->len);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
339 340
		btrfs_crit(BTRFS_I(inode)->root->fs_info,
			"bad ordering dec_start %llu end %llu", dec_start, dec_end);
341 342 343
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
344 345 346
		btrfs_crit(BTRFS_I(inode)->root->fs_info,
			"bad ordered accounting left %llu size %llu",
			entry->bytes_left, to_dec);
347 348
	}
	entry->bytes_left -= to_dec;
349 350 351
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

352
	if (entry->bytes_left == 0) {
353
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
354 355 356
		if (waitqueue_active(&entry->wait))
			wake_up(&entry->wait);
	} else {
357
		ret = 1;
358
	}
359 360 361 362 363
out:
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
364
	spin_unlock_irqrestore(&tree->lock, flags);
365 366 367
	return ret == 0;
}

368 369 370 371 372 373 374 375 376
/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO should not span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 */
377
int btrfs_dec_test_ordered_pending(struct inode *inode,
378
				   struct btrfs_ordered_extent **cached,
379
				   u64 file_offset, u64 io_size, int uptodate)
C
Chris Mason 已提交
380
{
381
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
382
	struct rb_node *node;
383
	struct btrfs_ordered_extent *entry = NULL;
384
	unsigned long flags;
385 386 387
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
388 389 390 391 392 393
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

394
	node = tree_search(tree, file_offset);
C
Chris Mason 已提交
395
	if (!node) {
396 397
		ret = 1;
		goto out;
C
Chris Mason 已提交
398 399
	}

400
	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
401
have_entry:
402 403 404
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
C
Chris Mason 已提交
405
	}
406

407
	if (io_size > entry->bytes_left) {
408 409
		btrfs_crit(BTRFS_I(inode)->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
410
		       entry->bytes_left, io_size);
411 412
	}
	entry->bytes_left -= io_size;
413 414 415
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

416
	if (entry->bytes_left == 0) {
417
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
418 419 420
		if (waitqueue_active(&entry->wait))
			wake_up(&entry->wait);
	} else {
421
		ret = 1;
422
	}
423
out:
424 425 426 427
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
428
	spin_unlock_irqrestore(&tree->lock, flags);
429 430
	return ret == 0;
}
C
Chris Mason 已提交
431

432
/* Needs to either be called under a log transaction or the log_mutex */
433 434
void btrfs_get_logged_extents(struct inode *inode,
			      struct list_head *logged_list)
435 436 437 438 439 440 441 442 443
{
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_ordered_extent *ordered;
	struct rb_node *n;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
444 445 446 447
		if (!list_empty(&ordered->log_list))
			continue;
		list_add_tail(&ordered->log_list, logged_list);
		atomic_inc(&ordered->refs);
448 449 450 451
	}
	spin_unlock_irq(&tree->lock);
}

452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474
void btrfs_put_logged_extents(struct list_head *logged_list)
{
	struct btrfs_ordered_extent *ordered;

	while (!list_empty(logged_list)) {
		ordered = list_first_entry(logged_list,
					   struct btrfs_ordered_extent,
					   log_list);
		list_del_init(&ordered->log_list);
		btrfs_put_ordered_extent(ordered);
	}
}

void btrfs_submit_logged_extents(struct list_head *logged_list,
				 struct btrfs_root *log)
{
	int index = log->log_transid % 2;

	spin_lock_irq(&log->log_extents_lock[index]);
	list_splice_tail(logged_list, &log->logged_list[index]);
	spin_unlock_irq(&log->log_extents_lock[index]);
}

475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
{
	struct btrfs_ordered_extent *ordered;
	int index = transid % 2;

	spin_lock_irq(&log->log_extents_lock[index]);
	while (!list_empty(&log->logged_list[index])) {
		ordered = list_first_entry(&log->logged_list[index],
					   struct btrfs_ordered_extent,
					   log_list);
		list_del_init(&ordered->log_list);
		spin_unlock_irq(&log->log_extents_lock[index]);
		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
						   &ordered->flags));
		btrfs_put_ordered_extent(ordered);
		spin_lock_irq(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&log->log_extents_lock[index]);
}

void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
{
	struct btrfs_ordered_extent *ordered;
	int index = transid % 2;

	spin_lock_irq(&log->log_extents_lock[index]);
	while (!list_empty(&log->logged_list[index])) {
		ordered = list_first_entry(&log->logged_list[index],
					   struct btrfs_ordered_extent,
					   log_list);
		list_del_init(&ordered->log_list);
		spin_unlock_irq(&log->log_extents_lock[index]);
		btrfs_put_ordered_extent(ordered);
		spin_lock_irq(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&log->log_extents_lock[index]);
}

513 514 515 516
/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
517
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
518
{
519 520 521
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

522 523
	trace_btrfs_ordered_extent_put(entry->inode, entry);

524
	if (atomic_dec_and_test(&entry->refs)) {
525 526
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
C
Chris Mason 已提交
527
		while (!list_empty(&entry->list)) {
528 529 530 531 532
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kfree(sum);
		}
533
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
534
	}
C
Chris Mason 已提交
535
}
536

537 538
/*
 * remove an ordered extent from the tree.  No references are dropped
539
 * and waiters are woken up.
540
 */
541 542
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
543
{
544
	struct btrfs_ordered_inode_tree *tree;
545
	struct btrfs_root *root = BTRFS_I(inode)->root;
546 547
	struct rb_node *node;

548
	tree = &BTRFS_I(inode)->ordered_tree;
549
	spin_lock_irq(&tree->lock);
550
	node = &entry->rb_node;
551
	rb_erase(node, &tree->tree);
552 553
	if (tree->last == node)
		tree->last = NULL;
554
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
555
	spin_unlock_irq(&tree->lock);
556

557
	spin_lock(&root->ordered_extent_lock);
558
	list_del_init(&entry->root_extent_list);
559
	root->nr_ordered_extents--;
560

561 562
	trace_btrfs_ordered_extent_remove(inode, entry);

563 564 565 566 567 568 569
	/*
	 * we have no more ordered extents for this inode and
	 * no dirty pages.  We can safely remove it from the
	 * list of ordered extents
	 */
	if (RB_EMPTY_ROOT(&tree->tree) &&
	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
570
		spin_lock(&root->fs_info->ordered_root_lock);
571
		list_del_init(&BTRFS_I(inode)->ordered_operations);
572
		spin_unlock(&root->fs_info->ordered_root_lock);
573
	}
574 575 576 577 578 579 580 581

	if (!root->nr_ordered_extents) {
		spin_lock(&root->fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&root->fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
582
	wake_up(&entry->wait);
583 584
}

585
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
586 587 588 589 590 591 592 593
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
	complete(&ordered->completion);
}

C
Chris Mason 已提交
594 595 596 597
/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
598
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
599
{
600 601
	struct list_head splice, works;
	struct btrfs_ordered_extent *ordered, *next;
602
	int count = 0;
603 604

	INIT_LIST_HEAD(&splice);
605
	INIT_LIST_HEAD(&works);
606

607
	mutex_lock(&root->ordered_extent_mutex);
608 609
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
610
	while (!list_empty(&splice) && nr) {
611 612 613 614 615 616
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);
		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		atomic_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);
617

618 619
		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL, NULL);
620
		list_add_tail(&ordered->work_list, &works);
621 622
		btrfs_queue_work(root->fs_info->flush_workers,
				 &ordered->flush_work);
623

624
		cond_resched();
625
		spin_lock(&root->ordered_extent_lock);
626 627 628
		if (nr != -1)
			nr--;
		count++;
629
	}
630
	list_splice_tail(&splice, &root->ordered_extents);
631
	spin_unlock(&root->ordered_extent_lock);
632 633 634 635 636 637 638

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
639
	mutex_unlock(&root->ordered_extent_mutex);
640 641

	return count;
642 643
}

644
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
645 646 647
{
	struct btrfs_root *root;
	struct list_head splice;
648
	int done;
649 650 651

	INIT_LIST_HEAD(&splice);

652
	mutex_lock(&fs_info->ordered_operations_mutex);
653 654
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
655
	while (!list_empty(&splice) && nr) {
656 657 658 659 660 661 662 663
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

664
		done = btrfs_wait_ordered_extents(root, nr);
665 666 667
		btrfs_put_fs_root(root);

		spin_lock(&fs_info->ordered_root_lock);
668 669 670 671
		if (nr != -1) {
			nr -= done;
			WARN_ON(nr < 0);
		}
672
	}
673
	list_splice_tail(&splice, &fs_info->ordered_roots);
674
	spin_unlock(&fs_info->ordered_root_lock);
675
	mutex_unlock(&fs_info->ordered_operations_mutex);
676 677
}

678 679 680 681 682 683 684 685 686 687
/*
 * this is used during transaction commit to write all the inodes
 * added to the ordered operation list.  These files must be fully on
 * disk before the transaction commits.
 *
 * we have two modes here, one is to just start the IO via filemap_flush
 * and the other is to wait for all the io.  When we wait, we have an
 * extra check to make sure the ordered operation list really is empty
 * before we return
 */
688 689
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, int wait)
690 691 692
{
	struct btrfs_inode *btrfs_inode;
	struct inode *inode;
693
	struct btrfs_transaction *cur_trans = trans->transaction;
694
	struct list_head splice;
695 696 697
	struct list_head works;
	struct btrfs_delalloc_work *work, *next;
	int ret = 0;
698 699

	INIT_LIST_HEAD(&splice);
700
	INIT_LIST_HEAD(&works);
701

702
	mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
703
	spin_lock(&root->fs_info->ordered_root_lock);
704
	list_splice_init(&cur_trans->ordered_operations, &splice);
705 706 707 708 709 710 711 712 713 714 715
	while (!list_empty(&splice)) {
		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
				   ordered_operations);
		inode = &btrfs_inode->vfs_inode;

		list_del_init(&btrfs_inode->ordered_operations);

		/*
		 * the inode may be getting freed (in sys_unlink path).
		 */
		inode = igrab(inode);
716 717
		if (!inode)
			continue;
718 719 720

		if (!wait)
			list_add_tail(&BTRFS_I(inode)->ordered_operations,
721
				      &cur_trans->ordered_operations);
722
		spin_unlock(&root->fs_info->ordered_root_lock);
723

724 725
		work = btrfs_alloc_delalloc_work(inode, wait, 1);
		if (!work) {
726
			spin_lock(&root->fs_info->ordered_root_lock);
727 728 729 730
			if (list_empty(&BTRFS_I(inode)->ordered_operations))
				list_add_tail(&btrfs_inode->ordered_operations,
					      &splice);
			list_splice_tail(&splice,
731
					 &cur_trans->ordered_operations);
732
			spin_unlock(&root->fs_info->ordered_root_lock);
733 734
			ret = -ENOMEM;
			goto out;
735
		}
736
		list_add_tail(&work->list, &works);
737 738
		btrfs_queue_work(root->fs_info->flush_workers,
				 &work->work);
739 740

		cond_resched();
741
		spin_lock(&root->fs_info->ordered_root_lock);
742
	}
743
	spin_unlock(&root->fs_info->ordered_root_lock);
744 745 746 747 748
out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}
749
	mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
750
	return ret;
751 752
}

753 754 755 756 757 758 759 760 761 762
/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
				       struct btrfs_ordered_extent *entry,
				       int wait)
763 764 765
{
	u64 start = entry->file_offset;
	u64 end = start + entry->len - 1;
766

767 768
	trace_btrfs_ordered_extent_start(inode, entry);

769 770 771
	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
772
	 * for the flusher thread to find them
773
	 */
774 775
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
C
Chris Mason 已提交
776
	if (wait) {
777 778
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
C
Chris Mason 已提交
779
	}
780
}
781

782 783 784
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
785
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
786
{
787
	int ret = 0;
788
	u64 end;
789
	u64 orig_end;
790
	struct btrfs_ordered_extent *ordered;
791 792

	if (start + len < start) {
793
		orig_end = INT_LIMIT(loff_t);
794 795
	} else {
		orig_end = start + len - 1;
796 797
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
798
	}
799

800 801 802
	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
803 804 805
	ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
	if (ret)
		return ret;
806 807 808 809 810 811 812 813 814 815 816 817 818 819 820
	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, setup an async extent, and immediately return
	 * with the entire range locked but with nobody actually marked with
	 * writeback.  So we can't just filemap_write_and_wait_range() and
	 * expect it to work since it will just kick off a thread to do the
	 * actual work.  So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there.  We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness.  Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
821 822 823 824 825 826 827 828 829
		     &BTRFS_I(inode)->runtime_flags)) {
		ret = filemap_fdatawrite_range(inode->i_mapping, start,
					       orig_end);
		if (ret)
			return ret;
	}
	ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
	if (ret)
		return ret;
830

831
	end = orig_end;
C
Chris Mason 已提交
832
	while (1) {
833
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
C
Chris Mason 已提交
834
		if (!ordered)
835
			break;
836
		if (ordered->file_offset > orig_end) {
837 838 839
			btrfs_put_ordered_extent(ordered);
			break;
		}
840
		if (ordered->file_offset + ordered->len <= start) {
841 842 843
			btrfs_put_ordered_extent(ordered);
			break;
		}
844
		btrfs_start_ordered_extent(inode, ordered, 1);
845
		end = ordered->file_offset;
846 847
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
848
		btrfs_put_ordered_extent(ordered);
849
		if (ret || end == 0 || end == start)
850 851 852
			break;
		end--;
	}
853
	return ret;
854 855
}

856 857 858 859
/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
860 861 862 863 864 865 866 867
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
868
	spin_lock_irq(&tree->lock);
869 870 871 872 873 874 875 876 877 878
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		atomic_inc(&entry->refs);
out:
879
	spin_unlock_irq(&tree->lock);
880 881 882
	return entry;
}

883 884 885 886 887 888 889 890 891 892 893 894
/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
							u64 file_offset,
							u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
895
	spin_lock_irq(&tree->lock);
896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		atomic_inc(&entry->refs);
920
	spin_unlock_irq(&tree->lock);
921 922 923
	return entry;
}

924 925 926 927
/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
928
struct btrfs_ordered_extent *
C
Chris Mason 已提交
929
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
930 931 932 933 934 935
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
936
	spin_lock_irq(&tree->lock);
937 938 939 940 941 942 943
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	atomic_inc(&entry->refs);
out:
944
	spin_unlock_irq(&tree->lock);
945
	return entry;
946
}
947

948 949 950 951
/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size.  i_size is updated to cover any fully written part of the file.
 */
952
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
953 954 955 956 957
				struct btrfs_ordered_extent *ordered)
{
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	u64 disk_i_size;
	u64 new_i_size;
958
	u64 i_size = i_size_read(inode);
959
	struct rb_node *node;
960
	struct rb_node *prev = NULL;
961
	struct btrfs_ordered_extent *test;
962 963
	int ret = 1;

964 965
	spin_lock_irq(&tree->lock);
	if (ordered) {
966
		offset = entry_end(ordered);
967 968 969 970 971
		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
			offset = min(offset,
				     ordered->file_offset +
				     ordered->truncated_len);
	} else {
972
		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
973
	}
974 975
	disk_i_size = BTRFS_I(inode)->disk_i_size;

976 977 978 979 980 981 982
	/* truncate file */
	if (disk_i_size > i_size) {
		BTRFS_I(inode)->disk_i_size = i_size;
		ret = 0;
		goto out;
	}

983 984 985 986
	/*
	 * if the disk i_size is already at the inode->i_size, or
	 * this ordered extent is inside the disk i_size, we're done
	 */
J
Josef Bacik 已提交
987 988 989 990 991 992 993 994 995
	if (disk_i_size == i_size)
		goto out;

	/*
	 * We still need to update disk_i_size if outstanding_isize is greater
	 * than disk_i_size.
	 */
	if (offset <= disk_i_size &&
	    (!ordered || ordered->outstanding_isize <= disk_i_size))
996 997 998 999 1000 1001 1002
		goto out;

	/*
	 * walk backward from this ordered extent to disk_i_size.
	 * if we find an ordered extent then we can't update disk i_size
	 * yet
	 */
1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017
	if (ordered) {
		node = rb_prev(&ordered->rb_node);
	} else {
		prev = tree_search(tree, offset);
		/*
		 * we insert file extents without involving ordered struct,
		 * so there should be no ordered struct cover this offset
		 */
		if (prev) {
			test = rb_entry(prev, struct btrfs_ordered_extent,
					rb_node);
			BUG_ON(offset_in_entry(test, offset));
		}
		node = prev;
	}
1018
	for (; node; node = rb_prev(node)) {
1019
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
1020 1021 1022 1023

		/* We treat this entry as if it doesnt exist */
		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
			continue;
1024 1025
		if (test->file_offset + test->len <= disk_i_size)
			break;
1026
		if (test->file_offset >= i_size)
1027
			break;
1028
		if (entry_end(test) > disk_i_size) {
1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040
			/*
			 * we don't update disk_i_size now, so record this
			 * undealt i_size. Or we will not know the real
			 * i_size.
			 */
			if (test->outstanding_isize < offset)
				test->outstanding_isize = offset;
			if (ordered &&
			    ordered->outstanding_isize >
			    test->outstanding_isize)
				test->outstanding_isize =
						ordered->outstanding_isize;
1041
			goto out;
1042
		}
1043
	}
1044
	new_i_size = min_t(u64, offset, i_size);
1045 1046

	/*
1047 1048
	 * Some ordered extents may completed before the current one, and
	 * we hold the real i_size in ->outstanding_isize.
1049
	 */
1050 1051
	if (ordered && ordered->outstanding_isize > new_i_size)
		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
1052
	BTRFS_I(inode)->disk_i_size = new_i_size;
1053
	ret = 0;
1054
out:
1055
	/*
1056 1057 1058 1059 1060
	 * We need to do this because we can't remove ordered extents until
	 * after the i_disk_size has been updated and then the inode has been
	 * updated to reflect the change, so we need to tell anybody who finds
	 * this ordered extent that we've already done all the real work, we
	 * just haven't completed all the other work.
1061 1062
	 */
	if (ordered)
1063 1064
		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
	spin_unlock_irq(&tree->lock);
1065
	return ret;
1066
}
1067

1068 1069 1070 1071 1072
/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum.  This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
1073
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
1074
			   u32 *sum, int len)
1075 1076 1077 1078
{
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
1079 1080 1081
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
1082
	int index = 0;
1083 1084 1085

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
1086
		return 0;
1087

1088
	spin_lock_irq(&tree->lock);
Q
Qinghuang Feng 已提交
1089
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
1090 1091 1092 1093 1094 1095
		if (disk_bytenr >= ordered_sum->bytenr &&
		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
			i = (disk_bytenr - ordered_sum->bytenr) >>
			    inode->i_sb->s_blocksize_bits;
			num_sectors = ordered_sum->len >>
				      inode->i_sb->s_blocksize_bits;
1096 1097 1098 1099 1100 1101 1102 1103
			num_sectors = min_t(int, len - index, num_sectors - i);
			memcpy(sum + index, ordered_sum->sums + i,
			       num_sectors);

			index += (int)num_sectors;
			if (index == len)
				goto out;
			disk_bytenr += num_sectors * sectorsize;
1104 1105 1106
		}
	}
out:
1107
	spin_unlock_irq(&tree->lock);
1108
	btrfs_put_ordered_extent(ordered);
1109
	return index;
1110 1111
}

1112

1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124
/*
 * add a given inode to the list of inodes that must be fully on
 * disk before a transaction commit finishes.
 *
 * This basically gives us the ext3 style data=ordered mode, and it is mostly
 * used to make sure renamed files are fully on disk.
 *
 * It is a noop if the inode is already fully on disk.
 *
 * If trans is not null, we'll do a friendly check for a transaction that
 * is already flushing things and force the IO down ourselves.
 */
1125 1126
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode)
1127
{
1128
	struct btrfs_transaction *cur_trans = trans->transaction;
1129 1130 1131 1132 1133 1134 1135 1136
	u64 last_mod;

	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);

	/*
	 * if this file hasn't been changed since the last transaction
	 * commit, we can safely return without doing anything
	 */
1137
	if (last_mod <= root->fs_info->last_trans_committed)
1138
		return;
1139

1140
	spin_lock(&root->fs_info->ordered_root_lock);
1141 1142
	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
		list_add_tail(&BTRFS_I(inode)->ordered_operations,
1143
			      &cur_trans->ordered_operations);
1144
	}
1145
	spin_unlock(&root->fs_info->ordered_root_lock);
1146
}
1147 1148 1149 1150 1151 1152 1153 1154 1155

/*
 * Create the slab cache for struct btrfs_ordered_extent.
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
				     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

/* Tear down the ordered-extent slab cache created by ordered_data_init(). */
void ordered_data_exit(void)
{
	/* init may have failed, leaving the cache pointer NULL */
	if (!btrfs_ordered_extent_cache)
		return;
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}