/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"

static u64 entry_end(struct btrfs_ordered_extent *entry)
C
Chris Mason 已提交
29
{
30 31 32
	if (entry->file_offset + entry->len < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->len;
C
Chris Mason 已提交
33 34
}

/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		/* go left for smaller offsets, right once past this entry's end */
		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;	/* overlapping entry already present */
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * panic helper used when an insert into the ordered tree finds an
 * overlapping entry -- that indicates filesystem corruption, so we
 * stop with a diagnostic that includes the offending offset
 */
static void ordered_data_tree_panic(struct inode *inode, int errno,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
		    "%llu\n", (unsigned long long)offset);
}

/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		/* remember the last node visited so we can fall back to it */
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;	/* exact containment hit */
	}
	if (!prev_ret)
		return NULL;

	/* walk forward while prev still ends at or before file_offset */
	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	/* then back up until prev ends at or before file_offset again */
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

C
Chris Mason 已提交
124 125 126
/*
 * helper to check if a given offset is inside a given entry
 */
127 128 129 130 131 132 133 134
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

135 136 137 138 139 140 141 142 143
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

/*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
{
	struct rb_root *root = &tree->tree;
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	/* check the single-entry cache before walking the tree */
	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		tree->last = ret;	/* refresh the cache for the next lookup */
	return ret;
}

/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 *
 * Returns 0 on success or -ENOMEM if the entry could not be allocated.
 * dio marks the extent as direct IO; compress_type records how the
 * data was compressed on disk.
 */
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int dio, int compress_type)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;

	tree = &BTRFS_I(inode)->ordered_tree;
	entry = kzalloc(sizeof(*entry), GFP_NOFS);
	if (!entry)
		return -ENOMEM;

	entry->file_offset = file_offset;
	entry->start = start;
	entry->len = len;
	entry->disk_len = disk_len;
	/* bytes_left counts down as IO completes against this extent */
	entry->bytes_left = len;
	entry->inode = inode;
	entry->compress_type = compress_type;
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
		set_bit(type, &entry->flags);

	if (dio)
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

	/* one ref for the tree */
	atomic_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->root_extent_list);

	trace_btrfs_ordered_extent_add(inode, entry);

	spin_lock(&tree->lock);
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
	if (node)
		ordered_data_tree_panic(inode, -EEXIST, file_offset);
	spin_unlock(&tree->lock);

	/* also hook the extent into the per-fs list of ordered extents */
	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);

	return 0;
}

/* add an ordered extent for ordinary buffered IO (no dio, no compression) */
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  BTRFS_COMPRESS_NONE);
}

/* add an ordered extent for direct IO (sets the dio flag on the entry) */
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 1,
					  BTRFS_COMPRESS_NONE);
}

/* add an ordered extent whose on-disk data uses the given compress_type */
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  compress_type);
}

255 256
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
257 258
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
259
 */
260 261 262
void btrfs_add_ordered_sum(struct inode *inode,
			   struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
C
Chris Mason 已提交
263
{
264
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
265

266
	tree = &BTRFS_I(inode)->ordered_tree;
267
	spin_lock(&tree->lock);
268
	list_add_tail(&sum->list, &entry->list);
269
	spin_unlock(&tree->lock);
C
Chris Mason 已提交
270 271
}

/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO may span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete.  This allows you to walk forward in the file.
 *
 * On return 1, *cached (if non-NULL) holds a referenced pointer to the
 * finished ordered extent; the caller owns that reference.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 *file_offset, u64 io_size)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock(&tree->lock);
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	/* clamp the accounted range to the intersection with this extent */
	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size, entry->file_offset +
		      entry->len);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
		printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
		       (unsigned long long)dec_start,
		       (unsigned long long)dec_end);
	}
	to_dec = dec_end - dec_start;
	/* NOTE(review): only warns on over-account; the subtraction below
	 * still proceeds and would wrap bytes_left -- verify upstream intent */
	if (to_dec > entry->bytes_left) {
		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
		       (unsigned long long)entry->bytes_left,
		       (unsigned long long)to_dec);
	}
	entry->bytes_left -= to_dec;
	if (entry->bytes_left == 0)
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
	else
		ret = 1;
out:
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
	spin_unlock(&tree->lock);
	return ret == 0;
}

/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO should not span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * On return 1, *cached (if non-NULL) holds a referenced pointer to the
 * finished ordered extent; the caller owns that reference.
 */
int btrfs_dec_test_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 file_offset, u64 io_size)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
	}

	/* NOTE(review): only warns on over-account; the subtraction below
	 * still proceeds and would wrap bytes_left -- verify upstream intent */
	if (io_size > entry->bytes_left) {
		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
		       (unsigned long long)entry->bytes_left,
		       (unsigned long long)io_size);
	}
	entry->bytes_left -= io_size;
	if (entry->bytes_left == 0)
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
	else
		ret = 1;
out:
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
	spin_unlock(&tree->lock);
	return ret == 0;
}
C
Chris Mason 已提交
389

390 391 392 393
/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
394
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
395
{
396 397 398
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

399 400
	trace_btrfs_ordered_extent_put(entry->inode, entry);

401
	if (atomic_dec_and_test(&entry->refs)) {
C
Chris Mason 已提交
402
		while (!list_empty(&entry->list)) {
403 404 405 406 407
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kfree(sum);
		}
408
		kfree(entry);
409
	}
C
Chris Mason 已提交
410
}
/*
 * remove an ordered extent from the tree.  No references are dropped
 * and you must wake_up entry->wait.  You must hold the tree lock
 * while you call this function.
 */
static void __btrfs_remove_ordered_extent(struct inode *inode,
					  struct btrfs_ordered_extent *entry)
{
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct rb_node *node;

	tree = &BTRFS_I(inode)->ordered_tree;
	node = &entry->rb_node;
	rb_erase(node, &tree->tree);
	/* invalidate the lookup cache; it may point at the erased node */
	tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);

	/* fs-wide lock nests inside the per-inode tree lock here */
	spin_lock(&root->fs_info->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);

	trace_btrfs_ordered_extent_remove(inode, entry);

	/*
	 * we have no more ordered extents for this inode and
	 * no dirty pages.  We can safely remove it from the
	 * list of ordered extents
	 */
	if (RB_EMPTY_ROOT(&tree->tree) &&
	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
		list_del_init(&BTRFS_I(inode)->ordered_operations);
	}
	spin_unlock(&root->fs_info->ordered_extent_lock);
}

/*
 * remove an ordered extent from the tree.  No references are dropped
 * but any waiters are woken.
 */
451 452
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
453 454 455 456
{
	struct btrfs_ordered_inode_tree *tree;

	tree = &BTRFS_I(inode)->ordered_tree;
457
	spin_lock(&tree->lock);
458
	__btrfs_remove_ordered_extent(inode, entry);
459
	spin_unlock(&tree->lock);
460
	wake_up(&entry->wait);
461 462
}

/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 *
 * nocow_only skips extents that are neither NOCOW nor PREALLOC;
 * delay_iput defers the final iput to the delayed-iput machinery.
 */
void btrfs_wait_ordered_extents(struct btrfs_root *root,
				int nocow_only, int delay_iput)
{
	struct list_head splice;
	struct list_head *cur;
	struct btrfs_ordered_extent *ordered;
	struct inode *inode;

	INIT_LIST_HEAD(&splice);

	/* detach the whole fs-wide list so we can drain it privately */
	spin_lock(&root->fs_info->ordered_extent_lock);
	list_splice_init(&root->fs_info->ordered_extents, &splice);
	while (!list_empty(&splice)) {
		cur = splice.next;
		ordered = list_entry(cur, struct btrfs_ordered_extent,
				     root_extent_list);
		if (nocow_only &&
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
			/* not interesting in nocow mode: requeue and move on */
			list_move(&ordered->root_extent_list,
				  &root->fs_info->ordered_extents);
			cond_resched_lock(&root->fs_info->ordered_extent_lock);
			continue;
		}

		list_del_init(&ordered->root_extent_list);
		atomic_inc(&ordered->refs);

		/*
		 * the inode may be getting freed (in sys_unlink path).
		 */
		inode = igrab(ordered->inode);

		/* drop the lock while we block waiting on the extent */
		spin_unlock(&root->fs_info->ordered_extent_lock);

		if (inode) {
			btrfs_start_ordered_extent(inode, ordered, 1);
			btrfs_put_ordered_extent(ordered);
			if (delay_iput)
				btrfs_add_delayed_iput(inode);
			else
				iput(inode);
		} else {
			btrfs_put_ordered_extent(ordered);
		}

		spin_lock(&root->fs_info->ordered_extent_lock);
	}
	spin_unlock(&root->fs_info->ordered_extent_lock);
}

/*
 * this is used during transaction commit to write all the inodes
 * added to the ordered operation list.  These files must be fully on
 * disk before the transaction commits.
 *
 * we have two modes here, one is to just start the IO via filemap_flush
 * and the other is to wait for all the io.  When we wait, we have an
 * extra check to make sure the ordered operation list really is empty
 * before we return
 */
void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
{
	struct btrfs_inode *btrfs_inode;
	struct inode *inode;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&root->fs_info->ordered_operations_mutex);
	spin_lock(&root->fs_info->ordered_extent_lock);
again:
	list_splice_init(&root->fs_info->ordered_operations, &splice);

	while (!list_empty(&splice)) {
		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
				   ordered_operations);

		inode = &btrfs_inode->vfs_inode;

		list_del_init(&btrfs_inode->ordered_operations);

		/*
		 * the inode may be getting freed (in sys_unlink path).
		 */
		inode = igrab(inode);

		/* flush-only mode: requeue so a later commit can wait on it */
		if (!wait && inode) {
			list_add_tail(&BTRFS_I(inode)->ordered_operations,
			      &root->fs_info->ordered_operations);
		}
		spin_unlock(&root->fs_info->ordered_extent_lock);

		if (inode) {
			if (wait)
				btrfs_wait_ordered_range(inode, 0, (u64)-1);
			else
				filemap_flush(inode->i_mapping);
			btrfs_add_delayed_iput(inode);
		}

		cond_resched();
		spin_lock(&root->fs_info->ordered_extent_lock);
	}
	/* inodes may have been re-added while the lock was dropped */
	if (wait && !list_empty(&root->fs_info->ordered_operations))
		goto again;

	spin_unlock(&root->fs_info->ordered_extent_lock);
	mutex_unlock(&root->fs_info->ordered_operations_mutex);
}

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
				       struct btrfs_ordered_extent *entry,
				       int wait)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->len - 1;

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
	 * for pdflush to find them
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (wait) {
		/* woken by whoever sets BTRFS_ORDERED_COMPLETE on the entry */
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
	}
}
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;
	int found;

	/* clamp the end so start + len can neither wrap nor exceed loff_t */
	if (start + len < start) {
		orig_end = INT_LIMIT(loff_t);
	} else {
		orig_end = start + len - 1;
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
	}

	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
	filemap_write_and_wait_range(inode->i_mapping, start, orig_end);

	end = orig_end;
	found = 0;
	/* walk backwards from the end, waiting on each overlapping extent */
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->len < start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		found++;
		btrfs_start_ordered_extent(inode, ordered, 1);
		end = ordered->file_offset;
		btrfs_put_ordered_extent(ordered);
		if (end == 0 || end == start)
			break;
		end--;
	}
}

654 655 656 657
/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
658 659 660 661 662 663 664 665
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
666
	spin_lock(&tree->lock);
667 668 669 670 671 672 673 674 675 676
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		atomic_inc(&entry->refs);
out:
677
	spin_unlock(&tree->lock);
678 679 680
	return entry;
}

/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 *
 * Returns the first ordered extent overlapping [file_offset,
 * file_offset + len), with a reference taken, or NULL.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
							u64 file_offset,
							u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node) {
		/* nothing at the start; try again from the end of the range */
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	/* scan forward until we overlap the range or pass its end */
	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		atomic_inc(&entry->refs);
	spin_unlock(&tree->lock);
	return entry;
}

722 723 724 725
/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
726
struct btrfs_ordered_extent *
C
Chris Mason 已提交
727
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
728 729 730 731 732 733
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
734
	spin_lock(&tree->lock);
735 736 737 738 739 740 741
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	atomic_inc(&entry->refs);
out:
742
	spin_unlock(&tree->lock);
743
	return entry;
744
}
/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size.  i_size is updated to cover any fully written part of the file.
 *
 * If 'ordered' is non-NULL it is removed from the tree (and its waiters
 * woken) before returning, with the tree lock held across the removal.
 * Returns 0 when disk_i_size was updated, 1 otherwise.
 */
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
				struct btrfs_ordered_extent *ordered)
{
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	u64 disk_i_size;
	u64 new_i_size;
	u64 i_size_test;
	u64 i_size = i_size_read(inode);
	struct rb_node *node;
	struct rb_node *prev = NULL;
	struct btrfs_ordered_extent *test;
	int ret = 1;

	if (ordered)
		offset = entry_end(ordered);
	else
		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);

	spin_lock(&tree->lock);
	disk_i_size = BTRFS_I(inode)->disk_i_size;

	/* truncate file */
	if (disk_i_size > i_size) {
		BTRFS_I(inode)->disk_i_size = i_size;
		ret = 0;
		goto out;
	}

	/*
	 * if the disk i_size is already at the inode->i_size, or
	 * this ordered extent is inside the disk i_size, we're done
	 */
	if (disk_i_size == i_size || offset <= disk_i_size) {
		goto out;
	}

	/*
	 * we can't update the disk_isize if there are delalloc bytes
	 * between disk_i_size and  this ordered extent
	 */
	if (test_range_bit(io_tree, disk_i_size, offset - 1,
			   EXTENT_DELALLOC, 0, NULL)) {
		goto out;
	}
	/*
	 * walk backward from this ordered extent to disk_i_size.
	 * if we find an ordered extent then we can't update disk i_size
	 * yet
	 */
	if (ordered) {
		node = rb_prev(&ordered->rb_node);
	} else {
		prev = tree_search(tree, offset);
		/*
		 * we insert file extents without involving ordered struct,
		 * so there should be no ordered struct cover this offset
		 */
		if (prev) {
			test = rb_entry(prev, struct btrfs_ordered_extent,
					rb_node);
			BUG_ON(offset_in_entry(test, offset));
		}
		node = prev;
	}
	while (node) {
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (test->file_offset + test->len <= disk_i_size)
			break;
		if (test->file_offset >= i_size)
			break;
		if (test->file_offset >= disk_i_size)
			goto out;
		node = rb_prev(node);
	}
	new_i_size = min_t(u64, offset, i_size);

	/*
	 * at this point, we know we can safely update i_size to at least
	 * the offset from this ordered extent.  But, we need to
	 * walk forward and see if ios from higher up in the file have
	 * finished.
	 */
	if (ordered) {
		node = rb_next(&ordered->rb_node);
	} else {
		if (prev)
			node = rb_next(prev);
		else
			node = rb_first(&tree->tree);
	}
	i_size_test = 0;
	if (node) {
		/*
		 * do we have an area where IO might have finished
		 * between our ordered extent and the next one.
		 */
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (test->file_offset > offset)
			i_size_test = test->file_offset;
	} else {
		i_size_test = i_size;
	}

	/*
	 * i_size_test is the end of a region after this ordered
	 * extent where there are no ordered extents.  As long as there
	 * are no delalloc bytes in this area, it is safe to update
	 * disk_i_size to the end of the region.
	 */
	if (i_size_test > offset &&
	    !test_range_bit(io_tree, offset, i_size_test - 1,
			    EXTENT_DELALLOC, 0, NULL)) {
		new_i_size = min_t(u64, i_size_test, i_size);
	}
	BTRFS_I(inode)->disk_i_size = new_i_size;
	ret = 0;
out:
	/*
	 * we need to remove the ordered extent with the tree lock held
	 * so that other people calling this function don't find our fully
	 * processed ordered entry and skip updating the i_size
	 */
	if (ordered)
		__btrfs_remove_ordered_extent(inode, ordered);
	spin_unlock(&tree->lock);
	if (ordered)
		wake_up(&ordered->wait);
	return ret;
}
/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum.  This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 *
 * Returns 0 and stores the checksum in *sum if the disk_bytenr is
 * found, 1 otherwise.
 */
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
			   u32 *sum)
{
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_sector_sum *sector_sums;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
	int ret = 1;

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
		return 1;

	spin_lock(&tree->lock);
	/* newest sums are at the tail, so search in reverse */
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
		if (disk_bytenr >= ordered_sum->bytenr) {
			num_sectors = ordered_sum->len / sectorsize;
			sector_sums = ordered_sum->sums;
			for (i = 0; i < num_sectors; i++) {
				if (sector_sums[i].bytenr == disk_bytenr) {
					*sum = sector_sums[i].sum;
					ret = 0;
					goto out;
				}
			}
		}
	}
out:
	spin_unlock(&tree->lock);
	/* drop the ref taken by btrfs_lookup_ordered_extent */
	btrfs_put_ordered_extent(ordered);
	return ret;
}

/*
 * add a given inode to the list of inodes that must be fully on
 * disk before a transaction commit finishes.
 *
 * This basically gives us the ext3 style data=ordered mode, and it is mostly
 * used to make sure renamed files are fully on disk.
 *
 * It is a noop if the inode is already fully on disk.
 *
 * If trans is not null, we'll do a friendly check for a transaction that
 * is already flushing things and force the IO down ourselves.
 */
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode)
{
	u64 last_mod;

	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);

	/*
	 * if this file hasn't been changed since the last transaction
	 * commit, we can safely return without doing anything
	 */
	if (last_mod < root->fs_info->last_trans_committed)
		return;

	/*
	 * the transaction is already committing.  Just start the IO and
	 * don't bother with all of this list nonsense
	 */
	if (trans && root->fs_info->running_transaction->blocked) {
		btrfs_wait_ordered_range(inode, 0, (u64)-1);
		return;
	}

	spin_lock(&root->fs_info->ordered_extent_lock);
	/* only queue the inode once; list_empty tells us it isn't queued yet */
	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
		list_add_tail(&BTRFS_I(inode)->ordered_operations,
			      &root->fs_info->ordered_operations);
	}
	spin_unlock(&root->fs_info->ordered_extent_lock);
}