/*
 * fs/btrfs/ordered-data.c — tracking of ordered (data) extents for btrfs.
 */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"

/* slab cache backing all struct btrfs_ordered_extent allocations */
static struct kmem_cache *btrfs_ordered_extent_cache;
static u64 entry_end(struct btrfs_ordered_extent *entry)
C
Chris Mason 已提交
32
{
33 34 35
	if (entry->file_offset + entry->len < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->len;
C
Chris Mason 已提交
36 37
}

C
Chris Mason 已提交
38 39 40
/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
41 42
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
C
Chris Mason 已提交
43
{
C
Chris Mason 已提交
44 45
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
46
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
47

C
Chris Mason 已提交
48
	while (*p) {
C
Chris Mason 已提交
49
		parent = *p;
50
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
51

52
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
53
			p = &(*p)->rb_left;
54
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
55 56 57 58 59 60 61 62 63 64
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

65 66 67 68 69
static void ordered_data_tree_panic(struct inode *inode, int errno,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
70
		    "%llu\n", offset);
71 72
}

C
Chris Mason 已提交
73 74 75 76
/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
77 78
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
C
Chris Mason 已提交
79
{
C
Chris Mason 已提交
80
	struct rb_node *n = root->rb_node;
C
Chris Mason 已提交
81
	struct rb_node *prev = NULL;
82 83 84
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;
C
Chris Mason 已提交
85

C
Chris Mason 已提交
86
	while (n) {
87
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
88 89 90
		prev = n;
		prev_entry = entry;

91
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
92
			n = n->rb_left;
93
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
94 95 96 97 98 99 100
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

C
Chris Mason 已提交
101
	while (prev && file_offset >= entry_end(prev_entry)) {
102 103 104 105 106 107 108 109 110 111 112 113 114
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
C
Chris Mason 已提交
115
	while (prev && file_offset < entry_end(prev_entry)) {
116 117 118 119 120 121
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
C
Chris Mason 已提交
122 123 124 125 126
	}
	*prev_ret = prev;
	return NULL;
}

C
Chris Mason 已提交
127 128 129
/*
 * helper to check if a given offset is inside a given entry
 */
130 131 132 133 134 135 136 137
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

138 139 140 141 142 143 144 145 146
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

C
Chris Mason 已提交
147 148 149 150
/*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
151 152
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
C
Chris Mason 已提交
153
{
154
	struct rb_root *root = &tree->tree;
155
	struct rb_node *prev = NULL;
C
Chris Mason 已提交
156
	struct rb_node *ret;
157 158 159 160 161 162 163 164 165
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
C
Chris Mason 已提交
166
	if (!ret)
167 168 169
		ret = prev;
	if (ret)
		tree->last = ret;
C
Chris Mason 已提交
170 171 172
	return ret;
}

173 174 175 176 177 178 179 180 181 182 183
/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
184 185
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
186
				      int type, int dio, int compress_type)
C
Chris Mason 已提交
187
{
188
	struct btrfs_root *root = BTRFS_I(inode)->root;
C
Chris Mason 已提交
189
	struct btrfs_ordered_inode_tree *tree;
190 191
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
192

193
	tree = &BTRFS_I(inode)->ordered_tree;
194
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
C
Chris Mason 已提交
195 196 197
	if (!entry)
		return -ENOMEM;

198 199 200
	entry->file_offset = file_offset;
	entry->start = start;
	entry->len = len;
201 202 203
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
	    !(type == BTRFS_ORDERED_NOCOW))
		entry->csum_bytes_left = disk_len;
C
Chris Mason 已提交
204
	entry->disk_len = disk_len;
205
	entry->bytes_left = len;
206
	entry->inode = igrab(inode);
207
	entry->compress_type = compress_type;
208
	entry->truncated_len = (u64)-1;
Y
Yan Zheng 已提交
209
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
Y
Yan Zheng 已提交
210
		set_bit(type, &entry->flags);
211

212 213 214
	if (dio)
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

215 216 217 218
	/* one ref for the tree */
	atomic_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
219
	INIT_LIST_HEAD(&entry->root_extent_list);
220 221
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
222
	INIT_LIST_HEAD(&entry->log_list);
C
Chris Mason 已提交
223

224 225
	trace_btrfs_ordered_extent_add(inode, entry);

226
	spin_lock_irq(&tree->lock);
227 228
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
229 230
	if (node)
		ordered_data_tree_panic(inode, -EEXIST, file_offset);
231
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
232

233
	spin_lock(&root->ordered_extent_lock);
234
	list_add_tail(&entry->root_extent_list,
235 236 237 238 239 240 241 242 243 244
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&root->fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root,
			      &root->fs_info->ordered_roots);
		spin_unlock(&root->fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
245

C
Chris Mason 已提交
246 247 248
	return 0;
}

249 250 251 252
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
253 254
					  disk_len, type, 0,
					  BTRFS_COMPRESS_NONE);
255 256 257 258 259 260
}

/* add an ordered extent for direct IO (sets BTRFS_ORDERED_DIRECT) */
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 1,
					  BTRFS_COMPRESS_NONE);
}

/* add an ordered extent for a compressed write with the given compress_type */
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  compress_type);
}

274 275
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
276 277
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
278
 */
279 280 281
void btrfs_add_ordered_sum(struct inode *inode,
			   struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
C
Chris Mason 已提交
282
{
283
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
284

285
	tree = &BTRFS_I(inode)->ordered_tree;
286
	spin_lock_irq(&tree->lock);
287
	list_add_tail(&sum->list, &entry->list);
288 289 290 291
	WARN_ON(entry->csum_bytes_left < sum->len);
	entry->csum_bytes_left -= sum->len;
	if (entry->csum_bytes_left == 0)
		wake_up(&entry->wait);
292
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
293 294
}

295 296 297 298 299 300 301 302 303 304 305 306 307 308
/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO may span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete.  This allows you to walk forward in the file.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
309
				   u64 *file_offset, u64 io_size, int uptodate)
310 311 312 313 314
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
315
	unsigned long flags;
316 317 318 319 320
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
321
	spin_lock_irqsave(&tree->lock, flags);
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size, entry->file_offset +
		      entry->len);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
339 340
		btrfs_crit(BTRFS_I(inode)->root->fs_info,
			"bad ordering dec_start %llu end %llu", dec_start, dec_end);
341 342 343
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
344 345 346
		btrfs_crit(BTRFS_I(inode)->root->fs_info,
			"bad ordered accounting left %llu size %llu",
			entry->bytes_left, to_dec);
347 348
	}
	entry->bytes_left -= to_dec;
349 350 351
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

352 353 354 355 356 357 358 359 360
	if (entry->bytes_left == 0)
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
	else
		ret = 1;
out:
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
361
	spin_unlock_irqrestore(&tree->lock, flags);
362 363 364
	return ret == 0;
}

365 366 367 368 369 370 371 372 373
/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO should not span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 */
374
int btrfs_dec_test_ordered_pending(struct inode *inode,
375
				   struct btrfs_ordered_extent **cached,
376
				   u64 file_offset, u64 io_size, int uptodate)
C
Chris Mason 已提交
377
{
378
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
379
	struct rb_node *node;
380
	struct btrfs_ordered_extent *entry = NULL;
381
	unsigned long flags;
382 383 384
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
385 386 387 388 389 390
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

391
	node = tree_search(tree, file_offset);
C
Chris Mason 已提交
392
	if (!node) {
393 394
		ret = 1;
		goto out;
C
Chris Mason 已提交
395 396
	}

397
	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
398
have_entry:
399 400 401
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
C
Chris Mason 已提交
402
	}
403

404
	if (io_size > entry->bytes_left) {
405 406
		btrfs_crit(BTRFS_I(inode)->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
407
		       entry->bytes_left, io_size);
408 409
	}
	entry->bytes_left -= io_size;
410 411 412
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

413
	if (entry->bytes_left == 0)
414
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
415 416
	else
		ret = 1;
417
out:
418 419 420 421
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
422
	spin_unlock_irqrestore(&tree->lock, flags);
423 424
	return ret == 0;
}
C
Chris Mason 已提交
425

426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485
/* Needs to either be called under a log transaction or the log_mutex */
void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
{
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_ordered_extent *ordered;
	struct rb_node *n;
	int index = log->log_transid % 2;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		spin_lock(&log->log_extents_lock[index]);
		if (list_empty(&ordered->log_list)) {
			list_add_tail(&ordered->log_list, &log->logged_list[index]);
			atomic_inc(&ordered->refs);
		}
		spin_unlock(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&tree->lock);
}

/*
 * Wait for IO completion on every extent collected on the log's logged
 * list for this transid, dropping the list's reference on each.
 */
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
{
	struct btrfs_ordered_extent *ordered;
	int index = transid % 2;

	spin_lock_irq(&log->log_extents_lock[index]);
	while (!list_empty(&log->logged_list[index])) {
		ordered = list_first_entry(&log->logged_list[index],
					   struct btrfs_ordered_extent,
					   log_list);
		list_del_init(&ordered->log_list);
		/* drop the lock while sleeping on IO completion */
		spin_unlock_irq(&log->log_extents_lock[index]);
		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
						   &ordered->flags));
		btrfs_put_ordered_extent(ordered);
		spin_lock_irq(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&log->log_extents_lock[index]);
}

/*
 * Drop every extent collected on the log's logged list for this transid
 * without waiting for IO (the not-waiting counterpart of
 * btrfs_wait_logged_extents).
 */
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
{
	struct btrfs_ordered_extent *ordered;
	int index = transid % 2;

	spin_lock_irq(&log->log_extents_lock[index]);
	while (!list_empty(&log->logged_list[index])) {
		ordered = list_first_entry(&log->logged_list[index],
					   struct btrfs_ordered_extent,
					   log_list);
		list_del_init(&ordered->log_list);
		spin_unlock_irq(&log->log_extents_lock[index]);
		btrfs_put_ordered_extent(ordered);
		spin_lock_irq(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&log->log_extents_lock[index]);
}

486 487 488 489
/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
490
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
491
{
492 493 494
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

495 496
	trace_btrfs_ordered_extent_put(entry->inode, entry);

497
	if (atomic_dec_and_test(&entry->refs)) {
498 499
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
C
Chris Mason 已提交
500
		while (!list_empty(&entry->list)) {
501 502 503 504 505
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kfree(sum);
		}
506
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
507
	}
C
Chris Mason 已提交
508
}
509

510 511
/*
 * remove an ordered extent from the tree.  No references are dropped
512
 * and waiters are woken up.
513
 */
514 515
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
516
{
517
	struct btrfs_ordered_inode_tree *tree;
518
	struct btrfs_root *root = BTRFS_I(inode)->root;
519 520
	struct rb_node *node;

521
	tree = &BTRFS_I(inode)->ordered_tree;
522
	spin_lock_irq(&tree->lock);
523
	node = &entry->rb_node;
524
	rb_erase(node, &tree->tree);
525 526
	if (tree->last == node)
		tree->last = NULL;
527
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
528
	spin_unlock_irq(&tree->lock);
529

530
	spin_lock(&root->ordered_extent_lock);
531
	list_del_init(&entry->root_extent_list);
532
	root->nr_ordered_extents--;
533

534 535
	trace_btrfs_ordered_extent_remove(inode, entry);

536 537 538 539 540 541 542
	/*
	 * we have no more ordered extents for this inode and
	 * no dirty pages.  We can safely remove it from the
	 * list of ordered extents
	 */
	if (RB_EMPTY_ROOT(&tree->tree) &&
	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
543
		spin_lock(&root->fs_info->ordered_root_lock);
544
		list_del_init(&BTRFS_I(inode)->ordered_operations);
545
		spin_unlock(&root->fs_info->ordered_root_lock);
546
	}
547 548 549 550 551 552 553 554

	if (!root->nr_ordered_extents) {
		spin_lock(&root->fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&root->fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
555
	wake_up(&entry->wait);
556 557
}

558 559 560 561 562 563 564 565 566
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
	complete(&ordered->completion);
}

C
Chris Mason 已提交
567 568 569 570
/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
571
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
572
{
573 574
	struct list_head splice, works;
	struct btrfs_ordered_extent *ordered, *next;
575
	int count = 0;
576 577

	INIT_LIST_HEAD(&splice);
578
	INIT_LIST_HEAD(&works);
579

580
	mutex_lock(&root->fs_info->ordered_operations_mutex);
581 582
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
583
	while (!list_empty(&splice) && nr) {
584 585 586 587 588 589
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);
		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		atomic_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);
590

591 592 593 594
		ordered->flush_work.func = btrfs_run_ordered_extent_work;
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers,
				   &ordered->flush_work);
595

596
		cond_resched();
597
		spin_lock(&root->ordered_extent_lock);
598 599 600
		if (nr != -1)
			nr--;
		count++;
601
	}
602
	list_splice_tail(&splice, &root->ordered_extents);
603
	spin_unlock(&root->ordered_extent_lock);
604 605 606 607 608 609 610

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
611
	mutex_unlock(&root->fs_info->ordered_operations_mutex);
612 613

	return count;
614 615
}

616
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
617 618 619
{
	struct btrfs_root *root;
	struct list_head splice;
620
	int done;
621 622 623 624 625

	INIT_LIST_HEAD(&splice);

	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
626
	while (!list_empty(&splice) && nr) {
627 628 629 630 631 632 633 634
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

635
		done = btrfs_wait_ordered_extents(root, nr);
636 637 638
		btrfs_put_fs_root(root);

		spin_lock(&fs_info->ordered_root_lock);
639 640 641 642
		if (nr != -1) {
			nr -= done;
			WARN_ON(nr < 0);
		}
643
	}
644
	list_splice_tail(&splice, &fs_info->ordered_roots);
645 646 647
	spin_unlock(&fs_info->ordered_root_lock);
}

648 649 650 651 652 653 654 655 656 657
/*
 * this is used during transaction commit to write all the inodes
 * added to the ordered operation list.  These files must be fully on
 * disk before the transaction commits.
 *
 * we have two modes here, one is to just start the IO via filemap_flush
 * and the other is to wait for all the io.  When we wait, we have an
 * extra check to make sure the ordered operation list really is empty
 * before we return
 */
658 659
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, int wait)
660 661 662
{
	struct btrfs_inode *btrfs_inode;
	struct inode *inode;
663
	struct btrfs_transaction *cur_trans = trans->transaction;
664
	struct list_head splice;
665 666 667
	struct list_head works;
	struct btrfs_delalloc_work *work, *next;
	int ret = 0;
668 669

	INIT_LIST_HEAD(&splice);
670
	INIT_LIST_HEAD(&works);
671

672
	mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
673
	spin_lock(&root->fs_info->ordered_root_lock);
674
	list_splice_init(&cur_trans->ordered_operations, &splice);
675 676 677 678 679 680 681 682 683 684 685
	while (!list_empty(&splice)) {
		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
				   ordered_operations);
		inode = &btrfs_inode->vfs_inode;

		list_del_init(&btrfs_inode->ordered_operations);

		/*
		 * the inode may be getting freed (in sys_unlink path).
		 */
		inode = igrab(inode);
686 687
		if (!inode)
			continue;
688 689 690

		if (!wait)
			list_add_tail(&BTRFS_I(inode)->ordered_operations,
691
				      &cur_trans->ordered_operations);
692
		spin_unlock(&root->fs_info->ordered_root_lock);
693

694 695
		work = btrfs_alloc_delalloc_work(inode, wait, 1);
		if (!work) {
696
			spin_lock(&root->fs_info->ordered_root_lock);
697 698 699 700
			if (list_empty(&BTRFS_I(inode)->ordered_operations))
				list_add_tail(&btrfs_inode->ordered_operations,
					      &splice);
			list_splice_tail(&splice,
701
					 &cur_trans->ordered_operations);
702
			spin_unlock(&root->fs_info->ordered_root_lock);
703 704
			ret = -ENOMEM;
			goto out;
705
		}
706 707 708
		list_add_tail(&work->list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers,
				   &work->work);
709 710

		cond_resched();
711
		spin_lock(&root->fs_info->ordered_root_lock);
712
	}
713
	spin_unlock(&root->fs_info->ordered_root_lock);
714 715 716 717 718
out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}
719
	mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
720
	return ret;
721 722
}

723 724 725 726 727 728 729 730 731 732
/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
				       struct btrfs_ordered_extent *entry,
				       int wait)
733 734 735
{
	u64 start = entry->file_offset;
	u64 end = start + entry->len - 1;
736

737 738
	trace_btrfs_ordered_extent_start(inode, entry);

739 740 741
	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
742
	 * for the flusher thread to find them
743
	 */
744 745
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
C
Chris Mason 已提交
746
	if (wait) {
747 748
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
C
Chris Mason 已提交
749
	}
750
}
751

752 753 754
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
755
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
756
{
757
	int ret = 0;
758
	u64 end;
759
	u64 orig_end;
760
	struct btrfs_ordered_extent *ordered;
761 762

	if (start + len < start) {
763
		orig_end = INT_LIMIT(loff_t);
764 765
	} else {
		orig_end = start + len - 1;
766 767
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
768
	}
769

770 771 772
	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
773 774 775
	ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
	if (ret)
		return ret;
776 777 778 779 780 781 782 783 784 785 786 787 788 789 790
	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, setup an async extent, and immediately return
	 * with the entire range locked but with nobody actually marked with
	 * writeback.  So we can't just filemap_write_and_wait_range() and
	 * expect it to work since it will just kick off a thread to do the
	 * actual work.  So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there.  We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness.  Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
791 792 793 794 795 796 797 798 799
		     &BTRFS_I(inode)->runtime_flags)) {
		ret = filemap_fdatawrite_range(inode->i_mapping, start,
					       orig_end);
		if (ret)
			return ret;
	}
	ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
	if (ret)
		return ret;
800

801
	end = orig_end;
C
Chris Mason 已提交
802
	while (1) {
803
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
C
Chris Mason 已提交
804
		if (!ordered)
805
			break;
806
		if (ordered->file_offset > orig_end) {
807 808 809
			btrfs_put_ordered_extent(ordered);
			break;
		}
810
		if (ordered->file_offset + ordered->len <= start) {
811 812 813
			btrfs_put_ordered_extent(ordered);
			break;
		}
814
		btrfs_start_ordered_extent(inode, ordered, 1);
815
		end = ordered->file_offset;
816 817
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
818
		btrfs_put_ordered_extent(ordered);
819
		if (ret || end == 0 || end == start)
820 821 822
			break;
		end--;
	}
823
	return ret;
824 825
}

826 827 828 829
/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
830 831 832 833 834 835 836 837
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
838
	spin_lock_irq(&tree->lock);
839 840 841 842 843 844 845 846 847 848
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		atomic_inc(&entry->refs);
out:
849
	spin_unlock_irq(&tree->lock);
850 851 852
	return entry;
}

853 854 855 856 857 858 859 860 861 862 863 864
/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
							u64 file_offset,
							u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
865
	spin_lock_irq(&tree->lock);
866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		atomic_inc(&entry->refs);
890
	spin_unlock_irq(&tree->lock);
891 892 893
	return entry;
}

894 895 896 897
/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
898
struct btrfs_ordered_extent *
C
Chris Mason 已提交
899
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
900 901 902 903 904 905
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
906
	spin_lock_irq(&tree->lock);
907 908 909 910 911 912 913
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	atomic_inc(&entry->refs);
out:
914
	spin_unlock_irq(&tree->lock);
915
	return entry;
916
}
917

918 919 920 921
/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size.  i_size is updated to cover any fully written part of the file.
 */
922
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
923 924 925 926 927
				struct btrfs_ordered_extent *ordered)
{
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	u64 disk_i_size;
	u64 new_i_size;
928
	u64 i_size = i_size_read(inode);
929
	struct rb_node *node;
930
	struct rb_node *prev = NULL;
931
	struct btrfs_ordered_extent *test;
932 933
	int ret = 1;

934 935
	spin_lock_irq(&tree->lock);
	if (ordered) {
936
		offset = entry_end(ordered);
937 938 939 940 941
		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
			offset = min(offset,
				     ordered->file_offset +
				     ordered->truncated_len);
	} else {
942
		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
943
	}
944 945
	disk_i_size = BTRFS_I(inode)->disk_i_size;

946 947 948 949 950 951 952
	/* truncate file */
	if (disk_i_size > i_size) {
		BTRFS_I(inode)->disk_i_size = i_size;
		ret = 0;
		goto out;
	}

953 954 955 956
	/*
	 * if the disk i_size is already at the inode->i_size, or
	 * this ordered extent is inside the disk i_size, we're done
	 */
J
Josef Bacik 已提交
957 958 959 960 961 962 963 964 965
	if (disk_i_size == i_size)
		goto out;

	/*
	 * We still need to update disk_i_size if outstanding_isize is greater
	 * than disk_i_size.
	 */
	if (offset <= disk_i_size &&
	    (!ordered || ordered->outstanding_isize <= disk_i_size))
966 967 968 969 970 971 972
		goto out;

	/*
	 * walk backward from this ordered extent to disk_i_size.
	 * if we find an ordered extent then we can't update disk i_size
	 * yet
	 */
973 974 975 976 977 978 979 980 981 982 983 984 985 986 987
	if (ordered) {
		node = rb_prev(&ordered->rb_node);
	} else {
		prev = tree_search(tree, offset);
		/*
		 * we insert file extents without involving ordered struct,
		 * so there should be no ordered struct cover this offset
		 */
		if (prev) {
			test = rb_entry(prev, struct btrfs_ordered_extent,
					rb_node);
			BUG_ON(offset_in_entry(test, offset));
		}
		node = prev;
	}
988
	for (; node; node = rb_prev(node)) {
989
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
990 991 992 993

		/* We treat this entry as if it doesnt exist */
		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
			continue;
994 995
		if (test->file_offset + test->len <= disk_i_size)
			break;
996
		if (test->file_offset >= i_size)
997
			break;
998
		if (entry_end(test) > disk_i_size) {
999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
			/*
			 * we don't update disk_i_size now, so record this
			 * undealt i_size. Or we will not know the real
			 * i_size.
			 */
			if (test->outstanding_isize < offset)
				test->outstanding_isize = offset;
			if (ordered &&
			    ordered->outstanding_isize >
			    test->outstanding_isize)
				test->outstanding_isize =
						ordered->outstanding_isize;
1011
			goto out;
1012
		}
1013
	}
1014
	new_i_size = min_t(u64, offset, i_size);
1015 1016

	/*
1017 1018
	 * Some ordered extents may completed before the current one, and
	 * we hold the real i_size in ->outstanding_isize.
1019
	 */
1020 1021
	if (ordered && ordered->outstanding_isize > new_i_size)
		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
1022
	BTRFS_I(inode)->disk_i_size = new_i_size;
1023
	ret = 0;
1024
out:
1025
	/*
1026 1027 1028 1029 1030
	 * We need to do this because we can't remove ordered extents until
	 * after the i_disk_size has been updated and then the inode has been
	 * updated to reflect the change, so we need to tell anybody who finds
	 * this ordered extent that we've already done all the real work, we
	 * just haven't completed all the other work.
1031 1032
	 */
	if (ordered)
1033 1034
		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
	spin_unlock_irq(&tree->lock);
1035
	return ret;
1036
}
1037

1038 1039 1040 1041 1042
/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum.  This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
1043
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
1044
			   u32 *sum, int len)
1045 1046 1047 1048
{
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
1049 1050 1051
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
1052
	int index = 0;
1053 1054 1055

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
1056
		return 0;
1057

1058
	spin_lock_irq(&tree->lock);
Q
Qinghuang Feng 已提交
1059
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
1060 1061 1062 1063 1064 1065
		if (disk_bytenr >= ordered_sum->bytenr &&
		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
			i = (disk_bytenr - ordered_sum->bytenr) >>
			    inode->i_sb->s_blocksize_bits;
			num_sectors = ordered_sum->len >>
				      inode->i_sb->s_blocksize_bits;
1066 1067 1068 1069 1070 1071 1072 1073
			num_sectors = min_t(int, len - index, num_sectors - i);
			memcpy(sum + index, ordered_sum->sums + i,
			       num_sectors);

			index += (int)num_sectors;
			if (index == len)
				goto out;
			disk_bytenr += num_sectors * sectorsize;
1074 1075 1076
		}
	}
out:
1077
	spin_unlock_irq(&tree->lock);
1078
	btrfs_put_ordered_extent(ordered);
1079
	return index;
1080 1081
}

1082

1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094
/*
 * add a given inode to the list of inodes that must be fully on
 * disk before a transaction commit finishes.
 *
 * This basically gives us the ext3 style data=ordered mode, and it is mostly
 * used to make sure renamed files are fully on disk.
 *
 * It is a noop if the inode is already fully on disk.
 *
 * If trans is not null, we'll do a friendly check for a transaction that
 * is already flushing things and force the IO down ourselves.
 */
1095 1096
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode)
1097
{
1098
	struct btrfs_transaction *cur_trans = trans->transaction;
1099 1100 1101 1102 1103 1104 1105 1106
	u64 last_mod;

	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);

	/*
	 * if this file hasn't been changed since the last transaction
	 * commit, we can safely return without doing anything
	 */
1107
	if (last_mod <= root->fs_info->last_trans_committed)
1108
		return;
1109

1110
	spin_lock(&root->fs_info->ordered_root_lock);
1111 1112
	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
		list_add_tail(&BTRFS_I(inode)->ordered_operations,
1113
			      &cur_trans->ordered_operations);
1114
	}
1115
	spin_unlock(&root->fs_info->ordered_root_lock);
1116
}
1117 1118 1119 1120 1121 1122 1123 1124 1125

/*
 * Create the slab cache for ordered extent records.  Called once at
 * module/fs init; returns 0 on success or -ENOMEM if the cache could
 * not be created.
 */
int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
				     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

/* Tear down the ordered extent slab cache created by ordered_data_init(). */
void ordered_data_exit(void)
{
	/* init may have failed before the cache was created */
	if (!btrfs_ordered_extent_cache)
		return;
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}