/* fs/btrfs/ordered-data.c — tracking of btrfs ordered (in-flight) extents */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"

28
static u64 entry_end(struct btrfs_ordered_extent *entry)
C
Chris Mason 已提交
29
{
30 31 32
	if (entry->file_offset + entry->len < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->len;
C
Chris Mason 已提交
33 34
}

C
Chris Mason 已提交
35 36 37
/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
38 39
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
C
Chris Mason 已提交
40
{
C
Chris Mason 已提交
41 42
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
43
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
44

C
Chris Mason 已提交
45
	while (*p) {
C
Chris Mason 已提交
46
		parent = *p;
47
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
48

49
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
50
			p = &(*p)->rb_left;
51
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
52 53 54 55 56 57 58 59 60 61
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

62 63 64 65 66 67 68 69
static void ordered_data_tree_panic(struct inode *inode, int errno,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
		    "%llu\n", (unsigned long long)offset);
}

C
Chris Mason 已提交
70 71 72 73
/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
74 75
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
C
Chris Mason 已提交
76
{
C
Chris Mason 已提交
77
	struct rb_node *n = root->rb_node;
C
Chris Mason 已提交
78
	struct rb_node *prev = NULL;
79 80 81
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;
C
Chris Mason 已提交
82

C
Chris Mason 已提交
83
	while (n) {
84
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
85 86 87
		prev = n;
		prev_entry = entry;

88
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
89
			n = n->rb_left;
90
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
91 92 93 94 95 96 97
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

C
Chris Mason 已提交
98
	while (prev && file_offset >= entry_end(prev_entry)) {
99 100 101 102 103 104 105 106 107 108 109 110 111
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
C
Chris Mason 已提交
112
	while (prev && file_offset < entry_end(prev_entry)) {
113 114 115 116 117 118
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
C
Chris Mason 已提交
119 120 121 122 123
	}
	*prev_ret = prev;
	return NULL;
}

C
Chris Mason 已提交
124 125 126
/*
 * helper to check if a given offset is inside a given entry
 */
127 128 129 130 131 132 133 134
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

135 136 137 138 139 140 141 142 143
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

C
Chris Mason 已提交
144 145 146 147
/*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
148 149
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
C
Chris Mason 已提交
150
{
151
	struct rb_root *root = &tree->tree;
152
	struct rb_node *prev = NULL;
C
Chris Mason 已提交
153
	struct rb_node *ret;
154 155 156 157 158 159 160 161 162
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
C
Chris Mason 已提交
163
	if (!ret)
164 165 166
		ret = prev;
	if (ret)
		tree->last = ret;
C
Chris Mason 已提交
167 168 169
	return ret;
}

170 171 172 173 174 175 176 177 178 179 180
/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
181 182
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
183
				      int type, int dio, int compress_type)
C
Chris Mason 已提交
184 185
{
	struct btrfs_ordered_inode_tree *tree;
186 187
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
188

189 190
	tree = &BTRFS_I(inode)->ordered_tree;
	entry = kzalloc(sizeof(*entry), GFP_NOFS);
C
Chris Mason 已提交
191 192 193
	if (!entry)
		return -ENOMEM;

194 195 196
	entry->file_offset = file_offset;
	entry->start = start;
	entry->len = len;
C
Chris Mason 已提交
197
	entry->disk_len = disk_len;
198
	entry->bytes_left = len;
199
	entry->inode = inode;
200
	entry->compress_type = compress_type;
Y
Yan Zheng 已提交
201
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
Y
Yan Zheng 已提交
202
		set_bit(type, &entry->flags);
203

204 205 206
	if (dio)
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

207 208 209 210
	/* one ref for the tree */
	atomic_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
211
	INIT_LIST_HEAD(&entry->root_extent_list);
C
Chris Mason 已提交
212

213 214
	trace_btrfs_ordered_extent_add(inode, entry);

215
	spin_lock(&tree->lock);
216 217
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
218 219
	if (node)
		ordered_data_tree_panic(inode, -EEXIST, file_offset);
220
	spin_unlock(&tree->lock);
C
Chris Mason 已提交
221

222 223 224 225 226
	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);

C
Chris Mason 已提交
227 228 229
	return 0;
}

230 231 232 233
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
234 235
					  disk_len, type, 0,
					  BTRFS_COMPRESS_NONE);
236 237 238 239 240 241
}

int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
242 243 244 245 246 247 248 249 250 251 252
					  disk_len, type, 1,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  compress_type);
253 254
}

255 256
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
257 258
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
259
 */
260 261 262
void btrfs_add_ordered_sum(struct inode *inode,
			   struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
C
Chris Mason 已提交
263
{
264
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
265

266
	tree = &BTRFS_I(inode)->ordered_tree;
267
	spin_lock(&tree->lock);
268
	list_add_tail(&sum->list, &entry->list);
269
	spin_unlock(&tree->lock);
C
Chris Mason 已提交
270 271
}

272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO may span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete.  This allows you to walk forward in the file.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 *file_offset, u64 io_size)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock(&tree->lock);
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size, entry->file_offset +
		      entry->len);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
		printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
		       (unsigned long long)dec_start,
		       (unsigned long long)dec_end);
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
		       (unsigned long long)entry->bytes_left,
		       (unsigned long long)to_dec);
	}
	entry->bytes_left -= to_dec;
	if (entry->bytes_left == 0)
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
	else
		ret = 1;
out:
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
	spin_unlock(&tree->lock);
	return ret == 0;
}

339 340 341 342 343 344 345 346 347
/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO should not span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 */
348
int btrfs_dec_test_ordered_pending(struct inode *inode,
349
				   struct btrfs_ordered_extent **cached,
350
				   u64 file_offset, u64 io_size)
C
Chris Mason 已提交
351
{
352
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
353
	struct rb_node *node;
354
	struct btrfs_ordered_extent *entry = NULL;
355 356 357
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
358
	spin_lock(&tree->lock);
359
	node = tree_search(tree, file_offset);
C
Chris Mason 已提交
360
	if (!node) {
361 362
		ret = 1;
		goto out;
C
Chris Mason 已提交
363 364
	}

365 366 367 368
	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
C
Chris Mason 已提交
369
	}
370

371 372 373 374 375 376 377
	if (io_size > entry->bytes_left) {
		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
		       (unsigned long long)entry->bytes_left,
		       (unsigned long long)io_size);
	}
	entry->bytes_left -= io_size;
	if (entry->bytes_left == 0)
378
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
379 380
	else
		ret = 1;
381
out:
382 383 384 385
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
386
	spin_unlock(&tree->lock);
387 388
	return ret == 0;
}
C
Chris Mason 已提交
389

390 391 392 393
/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
394
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
395
{
396 397 398
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

399 400
	trace_btrfs_ordered_extent_put(entry->inode, entry);

401
	if (atomic_dec_and_test(&entry->refs)) {
C
Chris Mason 已提交
402
		while (!list_empty(&entry->list)) {
403 404 405 406 407
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kfree(sum);
		}
408
		kfree(entry);
409
	}
C
Chris Mason 已提交
410
}
411

412 413
/*
 * remove an ordered extent from the tree.  No references are dropped
414
 * and you must wake_up entry->wait.  You must hold the tree lock
415
 * while you call this function.
416
 */
417 418
static void __btrfs_remove_ordered_extent(struct inode *inode,
					  struct btrfs_ordered_extent *entry)
419
{
420
	struct btrfs_ordered_inode_tree *tree;
421
	struct btrfs_root *root = BTRFS_I(inode)->root;
422 423
	struct rb_node *node;

424 425
	tree = &BTRFS_I(inode)->ordered_tree;
	node = &entry->rb_node;
426
	rb_erase(node, &tree->tree);
427 428
	tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
429

430
	spin_lock(&root->fs_info->ordered_extent_lock);
431
	list_del_init(&entry->root_extent_list);
432

433 434
	trace_btrfs_ordered_extent_remove(inode, entry);

435 436 437 438 439 440 441 442 443
	/*
	 * we have no more ordered extents for this inode and
	 * no dirty pages.  We can safely remove it from the
	 * list of ordered extents
	 */
	if (RB_EMPTY_ROOT(&tree->tree) &&
	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
		list_del_init(&BTRFS_I(inode)->ordered_operations);
	}
444
	spin_unlock(&root->fs_info->ordered_extent_lock);
445 446 447 448 449 450
}

/*
 * remove an ordered extent from the tree.  No references are dropped
 * but any waiters are woken.
 */
451 452
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
453 454 455 456
{
	struct btrfs_ordered_inode_tree *tree;

	tree = &BTRFS_I(inode)->ordered_tree;
457
	spin_lock(&tree->lock);
458
	__btrfs_remove_ordered_extent(inode, entry);
459
	spin_unlock(&tree->lock);
460
	wake_up(&entry->wait);
461 462
}

C
Chris Mason 已提交
463 464 465 466
/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
467 468
void btrfs_wait_ordered_extents(struct btrfs_root *root,
				int nocow_only, int delay_iput)
469 470 471 472 473 474 475 476 477 478
{
	struct list_head splice;
	struct list_head *cur;
	struct btrfs_ordered_extent *ordered;
	struct inode *inode;

	INIT_LIST_HEAD(&splice);

	spin_lock(&root->fs_info->ordered_extent_lock);
	list_splice_init(&root->fs_info->ordered_extents, &splice);
479
	while (!list_empty(&splice)) {
480 481 482
		cur = splice.next;
		ordered = list_entry(cur, struct btrfs_ordered_extent,
				     root_extent_list);
483
		if (nocow_only &&
Y
Yan Zheng 已提交
484 485
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
486 487
			list_move(&ordered->root_extent_list,
				  &root->fs_info->ordered_extents);
488 489 490 491
			cond_resched_lock(&root->fs_info->ordered_extent_lock);
			continue;
		}

492 493 494 495
		list_del_init(&ordered->root_extent_list);
		atomic_inc(&ordered->refs);

		/*
496
		 * the inode may be getting freed (in sys_unlink path).
497
		 */
498 499
		inode = igrab(ordered->inode);

500 501
		spin_unlock(&root->fs_info->ordered_extent_lock);

502 503 504
		if (inode) {
			btrfs_start_ordered_extent(inode, ordered, 1);
			btrfs_put_ordered_extent(ordered);
Y
Yan, Zheng 已提交
505 506 507 508
			if (delay_iput)
				btrfs_add_delayed_iput(inode);
			else
				iput(inode);
509 510 511
		} else {
			btrfs_put_ordered_extent(ordered);
		}
512 513 514 515 516 517

		spin_lock(&root->fs_info->ordered_extent_lock);
	}
	spin_unlock(&root->fs_info->ordered_extent_lock);
}

518 519 520 521 522 523 524 525 526 527
/*
 * this is used during transaction commit to write all the inodes
 * added to the ordered operation list.  These files must be fully on
 * disk before the transaction commits.
 *
 * we have two modes here, one is to just start the IO via filemap_flush
 * and the other is to wait for all the io.  When we wait, we have an
 * extra check to make sure the ordered operation list really is empty
 * before we return
 */
528
void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564
{
	struct btrfs_inode *btrfs_inode;
	struct inode *inode;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&root->fs_info->ordered_operations_mutex);
	spin_lock(&root->fs_info->ordered_extent_lock);
again:
	list_splice_init(&root->fs_info->ordered_operations, &splice);

	while (!list_empty(&splice)) {
		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
				   ordered_operations);

		inode = &btrfs_inode->vfs_inode;

		list_del_init(&btrfs_inode->ordered_operations);

		/*
		 * the inode may be getting freed (in sys_unlink path).
		 */
		inode = igrab(inode);

		if (!wait && inode) {
			list_add_tail(&BTRFS_I(inode)->ordered_operations,
			      &root->fs_info->ordered_operations);
		}
		spin_unlock(&root->fs_info->ordered_extent_lock);

		if (inode) {
			if (wait)
				btrfs_wait_ordered_range(inode, 0, (u64)-1);
			else
				filemap_flush(inode->i_mapping);
Y
Yan, Zheng 已提交
565
			btrfs_add_delayed_iput(inode);
566 567 568 569 570 571 572 573 574 575 576 577
		}

		cond_resched();
		spin_lock(&root->fs_info->ordered_extent_lock);
	}
	if (wait && !list_empty(&root->fs_info->ordered_operations))
		goto again;

	spin_unlock(&root->fs_info->ordered_extent_lock);
	mutex_unlock(&root->fs_info->ordered_operations_mutex);
}

578 579 580 581 582 583 584 585 586 587
/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
				       struct btrfs_ordered_extent *entry,
				       int wait)
588 589 590
{
	u64 start = entry->file_offset;
	u64 end = start + entry->len - 1;
591

592 593
	trace_btrfs_ordered_extent_start(inode, entry);

594 595 596 597 598
	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
	 * for pdflush to find them
	 */
599 600
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
C
Chris Mason 已提交
601
	if (wait) {
602 603
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
C
Chris Mason 已提交
604
	}
605
}
606

607 608 609
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
610
void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
611 612
{
	u64 end;
613
	u64 orig_end;
614
	struct btrfs_ordered_extent *ordered;
615
	int found;
616 617

	if (start + len < start) {
618
		orig_end = INT_LIMIT(loff_t);
619 620
	} else {
		orig_end = start + len - 1;
621 622
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
623
	}
C
Chris Mason 已提交
624
again:
625 626 627
	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
628
	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
629

630 631 632 633
	/* The compression code will leave pages locked but return from
	 * writepage without setting the page writeback.  Starting again
	 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
	 */
634
	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
635

636
	filemap_fdatawait_range(inode->i_mapping, start, orig_end);
637

638
	end = orig_end;
639
	found = 0;
C
Chris Mason 已提交
640
	while (1) {
641
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
C
Chris Mason 已提交
642
		if (!ordered)
643
			break;
644
		if (ordered->file_offset > orig_end) {
645 646 647 648 649 650 651
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->len < start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
652
		found++;
653
		btrfs_start_ordered_extent(inode, ordered, 1);
654 655
		end = ordered->file_offset;
		btrfs_put_ordered_extent(ordered);
656
		if (end == 0 || end == start)
657 658 659
			break;
		end--;
	}
660 661
	if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
			   EXTENT_DELALLOC, 0, NULL)) {
662
		schedule_timeout(1);
C
Chris Mason 已提交
663 664
		goto again;
	}
665 666
}

667 668 669 670
/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
671 672 673 674 675 676 677 678
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
679
	spin_lock(&tree->lock);
680 681 682 683 684 685 686 687 688 689
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		atomic_inc(&entry->refs);
out:
690
	spin_unlock(&tree->lock);
691 692 693
	return entry;
}

694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
							u64 file_offset,
							u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		atomic_inc(&entry->refs);
	spin_unlock(&tree->lock);
	return entry;
}

735 736 737 738
/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
739
struct btrfs_ordered_extent *
C
Chris Mason 已提交
740
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
741 742 743 744 745 746
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
747
	spin_lock(&tree->lock);
748 749 750 751 752 753 754
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	atomic_inc(&entry->refs);
out:
755
	spin_unlock(&tree->lock);
756
	return entry;
757
}
758

759 760 761 762
/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size.  i_size is updated to cover any fully written part of the file.
 */
763
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
764 765 766 767 768 769 770
				struct btrfs_ordered_extent *ordered)
{
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	u64 disk_i_size;
	u64 new_i_size;
	u64 i_size_test;
771
	u64 i_size = i_size_read(inode);
772
	struct rb_node *node;
773
	struct rb_node *prev = NULL;
774
	struct btrfs_ordered_extent *test;
775 776 777 778
	int ret = 1;

	if (ordered)
		offset = entry_end(ordered);
779 780
	else
		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
781

782
	spin_lock(&tree->lock);
783 784
	disk_i_size = BTRFS_I(inode)->disk_i_size;

785 786 787 788 789 790 791
	/* truncate file */
	if (disk_i_size > i_size) {
		BTRFS_I(inode)->disk_i_size = i_size;
		ret = 0;
		goto out;
	}

792 793 794 795
	/*
	 * if the disk i_size is already at the inode->i_size, or
	 * this ordered extent is inside the disk i_size, we're done
	 */
796
	if (disk_i_size == i_size || offset <= disk_i_size) {
797 798 799 800 801 802 803
		goto out;
	}

	/*
	 * we can't update the disk_isize if there are delalloc bytes
	 * between disk_i_size and  this ordered extent
	 */
804
	if (test_range_bit(io_tree, disk_i_size, offset - 1,
805
			   EXTENT_DELALLOC, 0, NULL)) {
806 807 808 809 810 811 812
		goto out;
	}
	/*
	 * walk backward from this ordered extent to disk_i_size.
	 * if we find an ordered extent then we can't update disk i_size
	 * yet
	 */
813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828
	if (ordered) {
		node = rb_prev(&ordered->rb_node);
	} else {
		prev = tree_search(tree, offset);
		/*
		 * we insert file extents without involving ordered struct,
		 * so there should be no ordered struct cover this offset
		 */
		if (prev) {
			test = rb_entry(prev, struct btrfs_ordered_extent,
					rb_node);
			BUG_ON(offset_in_entry(test, offset));
		}
		node = prev;
	}
	while (node) {
829 830 831
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (test->file_offset + test->len <= disk_i_size)
			break;
832
		if (test->file_offset >= i_size)
833 834 835
			break;
		if (test->file_offset >= disk_i_size)
			goto out;
836
		node = rb_prev(node);
837
	}
838
	new_i_size = min_t(u64, offset, i_size);
839 840 841 842 843 844 845

	/*
	 * at this point, we know we can safely update i_size to at least
	 * the offset from this ordered extent.  But, we need to
	 * walk forward and see if ios from higher up in the file have
	 * finished.
	 */
846 847 848 849 850 851 852 853
	if (ordered) {
		node = rb_next(&ordered->rb_node);
	} else {
		if (prev)
			node = rb_next(prev);
		else
			node = rb_first(&tree->tree);
	}
854 855 856 857 858 859 860
	i_size_test = 0;
	if (node) {
		/*
		 * do we have an area where IO might have finished
		 * between our ordered extent and the next one.
		 */
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
861
		if (test->file_offset > offset)
Y
Yan Zheng 已提交
862
			i_size_test = test->file_offset;
863
	} else {
864
		i_size_test = i_size;
865 866 867 868 869 870 871 872
	}

	/*
	 * i_size_test is the end of a region after this ordered
	 * extent where there are no ordered extents.  As long as there
	 * are no delalloc bytes in this area, it is safe to update
	 * disk_i_size to the end of the region.
	 */
873 874 875 876
	if (i_size_test > offset &&
	    !test_range_bit(io_tree, offset, i_size_test - 1,
			    EXTENT_DELALLOC, 0, NULL)) {
		new_i_size = min_t(u64, i_size_test, i_size);
877 878
	}
	BTRFS_I(inode)->disk_i_size = new_i_size;
879
	ret = 0;
880
out:
881 882 883 884 885 886 887
	/*
	 * we need to remove the ordered extent with the tree lock held
	 * so that other people calling this function don't find our fully
	 * processed ordered entry and skip updating the i_size
	 */
	if (ordered)
		__btrfs_remove_ordered_extent(inode, ordered);
888
	spin_unlock(&tree->lock);
889 890 891
	if (ordered)
		wake_up(&ordered->wait);
	return ret;
892
}
893

894 895 896 897 898
/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum.  This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
899 900
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
			   u32 *sum)
901 902 903 904 905
{
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_sector_sum *sector_sums;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
906 907 908
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
909 910 911 912 913 914
	int ret = 1;

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
		return 1;

915
	spin_lock(&tree->lock);
Q
Qinghuang Feng 已提交
916
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
917
		if (disk_bytenr >= ordered_sum->bytenr) {
918
			num_sectors = ordered_sum->len / sectorsize;
919
			sector_sums = ordered_sum->sums;
920
			for (i = 0; i < num_sectors; i++) {
921
				if (sector_sums[i].bytenr == disk_bytenr) {
922 923 924 925 926
					*sum = sector_sums[i].sum;
					ret = 0;
					goto out;
				}
			}
927 928 929
		}
	}
out:
930
	spin_unlock(&tree->lock);
931
	btrfs_put_ordered_extent(ordered);
932 933 934
	return ret;
}

935

936 937 938 939 940 941 942 943 944 945 946 947
/*
 * add a given inode to the list of inodes that must be fully on
 * disk before a transaction commit finishes.
 *
 * This basically gives us the ext3 style data=ordered mode, and it is mostly
 * used to make sure renamed files are fully on disk.
 *
 * It is a noop if the inode is already fully on disk.
 *
 * If trans is not null, we'll do a friendly check for a transaction that
 * is already flushing things and force the IO down ourselves.
 */
948 949
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode)
950 951 952 953 954 955 956 957 958 959
{
	u64 last_mod;

	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);

	/*
	 * if this file hasn't been changed since the last transaction
	 * commit, we can safely return without doing anything
	 */
	if (last_mod < root->fs_info->last_trans_committed)
960
		return;
961 962 963 964 965 966 967

	/*
	 * the transaction is already committing.  Just start the IO and
	 * don't bother with all of this list nonsense
	 */
	if (trans && root->fs_info->running_transaction->blocked) {
		btrfs_wait_ordered_range(inode, 0, (u64)-1);
968
		return;
969 970 971 972 973 974 975 976 977
	}

	spin_lock(&root->fs_info->ordered_extent_lock);
	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
		list_add_tail(&BTRFS_I(inode)->ordered_operations,
			      &root->fs_info->ordered_operations);
	}
	spin_unlock(&root->fs_info->ordered_extent_lock);
}