ordered-data.c 31.3 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/slab.h>
20
#include <linux/blkdev.h>
21 22
#include <linux/writeback.h>
#include <linux/pagevec.h>
C
Chris Mason 已提交
23 24 25
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
26
#include "extent_io.h"
27
#include "disk-io.h"
C
Chris Mason 已提交
28

29 30
static struct kmem_cache *btrfs_ordered_extent_cache;

31
static u64 entry_end(struct btrfs_ordered_extent *entry)
C
Chris Mason 已提交
32
{
33 34 35
	if (entry->file_offset + entry->len < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->len;
C
Chris Mason 已提交
36 37
}

C
Chris Mason 已提交
38 39 40
/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
41 42
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
C
Chris Mason 已提交
43
{
C
Chris Mason 已提交
44 45
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
46
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
47

C
Chris Mason 已提交
48
	while (*p) {
C
Chris Mason 已提交
49
		parent = *p;
50
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
51

52
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
53
			p = &(*p)->rb_left;
54
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
55 56 57 58 59 60 61 62 63 64
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

65 66 67 68 69 70 71 72
static void ordered_data_tree_panic(struct inode *inode, int errno,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
		    "%llu\n", (unsigned long long)offset);
}

C
Chris Mason 已提交
73 74 75 76
/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
77 78
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
C
Chris Mason 已提交
79
{
C
Chris Mason 已提交
80
	struct rb_node *n = root->rb_node;
C
Chris Mason 已提交
81
	struct rb_node *prev = NULL;
82 83 84
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;
C
Chris Mason 已提交
85

C
Chris Mason 已提交
86
	while (n) {
87
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
88 89 90
		prev = n;
		prev_entry = entry;

91
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
92
			n = n->rb_left;
93
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
94 95 96 97 98 99 100
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

C
Chris Mason 已提交
101
	while (prev && file_offset >= entry_end(prev_entry)) {
102 103 104 105 106 107 108 109 110 111 112 113 114
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
C
Chris Mason 已提交
115
	while (prev && file_offset < entry_end(prev_entry)) {
116 117 118 119 120 121
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
C
Chris Mason 已提交
122 123 124 125 126
	}
	*prev_ret = prev;
	return NULL;
}

C
Chris Mason 已提交
127 128 129
/*
 * helper to check if a given offset is inside a given entry
 */
130 131 132 133 134 135 136 137
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

138 139 140 141 142 143 144 145 146
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

C
Chris Mason 已提交
147 148 149 150
/*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
151 152
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
C
Chris Mason 已提交
153
{
154
	struct rb_root *root = &tree->tree;
155
	struct rb_node *prev = NULL;
C
Chris Mason 已提交
156
	struct rb_node *ret;
157 158 159 160 161 162 163 164 165
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
C
Chris Mason 已提交
166
	if (!ret)
167 168 169
		ret = prev;
	if (ret)
		tree->last = ret;
C
Chris Mason 已提交
170 171 172
	return ret;
}

173 174 175 176 177 178 179 180 181 182 183
/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
184 185
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
186
				      int type, int dio, int compress_type)
C
Chris Mason 已提交
187
{
188
	struct btrfs_root *root = BTRFS_I(inode)->root;
C
Chris Mason 已提交
189
	struct btrfs_ordered_inode_tree *tree;
190 191
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
192

193
	tree = &BTRFS_I(inode)->ordered_tree;
194
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
C
Chris Mason 已提交
195 196 197
	if (!entry)
		return -ENOMEM;

198 199 200
	entry->file_offset = file_offset;
	entry->start = start;
	entry->len = len;
201 202 203
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
	    !(type == BTRFS_ORDERED_NOCOW))
		entry->csum_bytes_left = disk_len;
C
Chris Mason 已提交
204
	entry->disk_len = disk_len;
205
	entry->bytes_left = len;
206
	entry->inode = igrab(inode);
207
	entry->compress_type = compress_type;
Y
Yan Zheng 已提交
208
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
Y
Yan Zheng 已提交
209
		set_bit(type, &entry->flags);
210

211 212 213
	if (dio)
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

214 215 216 217
	/* one ref for the tree */
	atomic_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
218
	INIT_LIST_HEAD(&entry->root_extent_list);
219 220
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
221
	INIT_LIST_HEAD(&entry->log_list);
C
Chris Mason 已提交
222

223 224
	trace_btrfs_ordered_extent_add(inode, entry);

225
	spin_lock_irq(&tree->lock);
226 227
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
228 229
	if (node)
		ordered_data_tree_panic(inode, -EEXIST, file_offset);
230
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
231

232
	spin_lock(&root->ordered_extent_lock);
233
	list_add_tail(&entry->root_extent_list,
234 235 236 237 238 239 240 241 242 243
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&root->fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root,
			      &root->fs_info->ordered_roots);
		spin_unlock(&root->fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
244

C
Chris Mason 已提交
245 246 247
	return 0;
}

248 249 250 251
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
252 253
					  disk_len, type, 0,
					  BTRFS_COMPRESS_NONE);
254 255 256 257 258 259
}

int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
260 261 262 263 264 265 266 267 268 269 270
					  disk_len, type, 1,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  compress_type);
271 272
}

273 274
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
275 276
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
277
 */
278 279 280
void btrfs_add_ordered_sum(struct inode *inode,
			   struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
C
Chris Mason 已提交
281
{
282
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
283

284
	tree = &BTRFS_I(inode)->ordered_tree;
285
	spin_lock_irq(&tree->lock);
286
	list_add_tail(&sum->list, &entry->list);
287 288 289 290
	WARN_ON(entry->csum_bytes_left < sum->len);
	entry->csum_bytes_left -= sum->len;
	if (entry->csum_bytes_left == 0)
		wake_up(&entry->wait);
291
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
292 293
}

294 295 296 297 298 299 300 301 302 303 304 305 306 307
/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO may span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete.  This allows you to walk forward in the file.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
308
				   u64 *file_offset, u64 io_size, int uptodate)
309 310 311 312 313
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
314
	unsigned long flags;
315 316 317 318 319
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
320
	spin_lock_irqsave(&tree->lock, flags);
321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size, entry->file_offset +
		      entry->len);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
		printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
		       (unsigned long long)dec_start,
		       (unsigned long long)dec_end);
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
		       (unsigned long long)entry->bytes_left,
		       (unsigned long long)to_dec);
	}
	entry->bytes_left -= to_dec;
349 350 351
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

352 353 354 355 356 357 358 359 360
	if (entry->bytes_left == 0)
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
	else
		ret = 1;
out:
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
361
	spin_unlock_irqrestore(&tree->lock, flags);
362 363 364
	return ret == 0;
}

365 366 367 368 369 370 371 372 373
/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO should not span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 */
374
int btrfs_dec_test_ordered_pending(struct inode *inode,
375
				   struct btrfs_ordered_extent **cached,
376
				   u64 file_offset, u64 io_size, int uptodate)
C
Chris Mason 已提交
377
{
378
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
379
	struct rb_node *node;
380
	struct btrfs_ordered_extent *entry = NULL;
381
	unsigned long flags;
382 383 384
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
385 386 387 388 389 390
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

391
	node = tree_search(tree, file_offset);
C
Chris Mason 已提交
392
	if (!node) {
393 394
		ret = 1;
		goto out;
C
Chris Mason 已提交
395 396
	}

397
	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
398
have_entry:
399 400 401
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
C
Chris Mason 已提交
402
	}
403

404 405 406 407 408 409
	if (io_size > entry->bytes_left) {
		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
		       (unsigned long long)entry->bytes_left,
		       (unsigned long long)io_size);
	}
	entry->bytes_left -= io_size;
410 411 412
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

413
	if (entry->bytes_left == 0)
414
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
415 416
	else
		ret = 1;
417
out:
418 419 420 421
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
422
	spin_unlock_irqrestore(&tree->lock, flags);
423 424
	return ret == 0;
}
C
Chris Mason 已提交
425

426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485
/* Needs to either be called under a log transaction or the log_mutex */
void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
{
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_ordered_extent *ordered;
	struct rb_node *n;
	int index = log->log_transid % 2;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		spin_lock(&log->log_extents_lock[index]);
		if (list_empty(&ordered->log_list)) {
			list_add_tail(&ordered->log_list, &log->logged_list[index]);
			atomic_inc(&ordered->refs);
		}
		spin_unlock(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&tree->lock);
}

void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
{
	struct btrfs_ordered_extent *ordered;
	int index = transid % 2;

	spin_lock_irq(&log->log_extents_lock[index]);
	while (!list_empty(&log->logged_list[index])) {
		ordered = list_first_entry(&log->logged_list[index],
					   struct btrfs_ordered_extent,
					   log_list);
		list_del_init(&ordered->log_list);
		spin_unlock_irq(&log->log_extents_lock[index]);
		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
						   &ordered->flags));
		btrfs_put_ordered_extent(ordered);
		spin_lock_irq(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&log->log_extents_lock[index]);
}

void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
{
	struct btrfs_ordered_extent *ordered;
	int index = transid % 2;

	spin_lock_irq(&log->log_extents_lock[index]);
	while (!list_empty(&log->logged_list[index])) {
		ordered = list_first_entry(&log->logged_list[index],
					   struct btrfs_ordered_extent,
					   log_list);
		list_del_init(&ordered->log_list);
		spin_unlock_irq(&log->log_extents_lock[index]);
		btrfs_put_ordered_extent(ordered);
		spin_lock_irq(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&log->log_extents_lock[index]);
}

486 487 488 489
/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
490
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
491
{
492 493 494
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

495 496
	trace_btrfs_ordered_extent_put(entry->inode, entry);

497
	if (atomic_dec_and_test(&entry->refs)) {
498 499
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
C
Chris Mason 已提交
500
		while (!list_empty(&entry->list)) {
501 502 503 504 505
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kfree(sum);
		}
506
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
507
	}
C
Chris Mason 已提交
508
}
509

510 511
/*
 * remove an ordered extent from the tree.  No references are dropped
512
 * and waiters are woken up.
513
 */
514 515
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
516
{
517
	struct btrfs_ordered_inode_tree *tree;
518
	struct btrfs_root *root = BTRFS_I(inode)->root;
519 520
	struct rb_node *node;

521
	tree = &BTRFS_I(inode)->ordered_tree;
522
	spin_lock_irq(&tree->lock);
523
	node = &entry->rb_node;
524
	rb_erase(node, &tree->tree);
525 526
	tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
527
	spin_unlock_irq(&tree->lock);
528

529
	spin_lock(&root->ordered_extent_lock);
530
	list_del_init(&entry->root_extent_list);
531
	root->nr_ordered_extents--;
532

533 534
	trace_btrfs_ordered_extent_remove(inode, entry);

535 536 537 538 539 540 541 542 543
	/*
	 * we have no more ordered extents for this inode and
	 * no dirty pages.  We can safely remove it from the
	 * list of ordered extents
	 */
	if (RB_EMPTY_ROOT(&tree->tree) &&
	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
		list_del_init(&BTRFS_I(inode)->ordered_operations);
	}
544 545 546 547 548 549 550 551

	if (!root->nr_ordered_extents) {
		spin_lock(&root->fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&root->fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
552
	wake_up(&entry->wait);
553 554
}

555 556 557 558 559 560 561 562 563
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
	complete(&ordered->completion);
}

C
Chris Mason 已提交
564 565 566 567
/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
568
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
569
{
570 571
	struct list_head splice, works;
	struct btrfs_ordered_extent *ordered, *next;
572 573 574
	struct inode *inode;

	INIT_LIST_HEAD(&splice);
575
	INIT_LIST_HEAD(&works);
576

577
	mutex_lock(&root->fs_info->ordered_operations_mutex);
578 579
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
580
	while (!list_empty(&splice)) {
581 582 583 584
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);
		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
585
		/*
586
		 * the inode may be getting freed (in sys_unlink path).
587
		 */
588
		inode = igrab(ordered->inode);
589 590 591 592
		if (!inode) {
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}
593

594 595
		atomic_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);
596

597 598 599 600
		ordered->flush_work.func = btrfs_run_ordered_extent_work;
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers,
				   &ordered->flush_work);
601

602
		cond_resched();
603
		spin_lock(&root->ordered_extent_lock);
604
	}
605
	spin_unlock(&root->ordered_extent_lock);
606 607 608 609 610 611 612 613 614 615 616 617 618 619

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);

		inode = ordered->inode;
		btrfs_put_ordered_extent(ordered);
		if (delay_iput)
			btrfs_add_delayed_iput(inode);
		else
			iput(inode);

		cond_resched();
	}
620
	mutex_unlock(&root->fs_info->ordered_operations_mutex);
621 622
}

623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649
void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
				    int delay_iput)
{
	struct btrfs_root *root;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice)) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		btrfs_wait_ordered_extents(root, delay_iput);
		btrfs_put_fs_root(root);

		spin_lock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&fs_info->ordered_root_lock);
}

650 651 652 653 654 655 656 657 658 659
/*
 * this is used during transaction commit to write all the inodes
 * added to the ordered operation list.  These files must be fully on
 * disk before the transaction commits.
 *
 * we have two modes here, one is to just start the IO via filemap_flush
 * and the other is to wait for all the io.  When we wait, we have an
 * extra check to make sure the ordered operation list really is empty
 * before we return
 */
660 661
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, int wait)
662 663 664
{
	struct btrfs_inode *btrfs_inode;
	struct inode *inode;
665
	struct btrfs_transaction *cur_trans = trans->transaction;
666
	struct list_head splice;
667 668 669
	struct list_head works;
	struct btrfs_delalloc_work *work, *next;
	int ret = 0;
670 671

	INIT_LIST_HEAD(&splice);
672
	INIT_LIST_HEAD(&works);
673 674

	mutex_lock(&root->fs_info->ordered_operations_mutex);
675
	spin_lock(&root->fs_info->ordered_root_lock);
676
	list_splice_init(&cur_trans->ordered_operations, &splice);
677 678 679 680 681 682 683 684 685 686 687
	while (!list_empty(&splice)) {
		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
				   ordered_operations);
		inode = &btrfs_inode->vfs_inode;

		list_del_init(&btrfs_inode->ordered_operations);

		/*
		 * the inode may be getting freed (in sys_unlink path).
		 */
		inode = igrab(inode);
688 689
		if (!inode)
			continue;
690 691 692

		if (!wait)
			list_add_tail(&BTRFS_I(inode)->ordered_operations,
693
				      &cur_trans->ordered_operations);
694
		spin_unlock(&root->fs_info->ordered_root_lock);
695

696 697
		work = btrfs_alloc_delalloc_work(inode, wait, 1);
		if (!work) {
698
			spin_lock(&root->fs_info->ordered_root_lock);
699 700 701 702
			if (list_empty(&BTRFS_I(inode)->ordered_operations))
				list_add_tail(&btrfs_inode->ordered_operations,
					      &splice);
			list_splice_tail(&splice,
703
					 &cur_trans->ordered_operations);
704
			spin_unlock(&root->fs_info->ordered_root_lock);
705 706
			ret = -ENOMEM;
			goto out;
707
		}
708 709 710
		list_add_tail(&work->list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers,
				   &work->work);
711 712

		cond_resched();
713
		spin_lock(&root->fs_info->ordered_root_lock);
714
	}
715
	spin_unlock(&root->fs_info->ordered_root_lock);
716 717 718 719 720
out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}
721
	mutex_unlock(&root->fs_info->ordered_operations_mutex);
722
	return ret;
723 724
}

725 726 727 728 729 730 731 732 733 734
/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
				       struct btrfs_ordered_extent *entry,
				       int wait)
735 736 737
{
	u64 start = entry->file_offset;
	u64 end = start + entry->len - 1;
738

739 740
	trace_btrfs_ordered_extent_start(inode, entry);

741 742 743
	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
744
	 * for the flusher thread to find them
745
	 */
746 747
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
C
Chris Mason 已提交
748
	if (wait) {
749 750
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
C
Chris Mason 已提交
751
	}
752
}
753

754 755 756
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
757
void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
758 759
{
	u64 end;
760
	u64 orig_end;
761
	struct btrfs_ordered_extent *ordered;
762 763

	if (start + len < start) {
764
		orig_end = INT_LIMIT(loff_t);
765 766
	} else {
		orig_end = start + len - 1;
767 768
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
769
	}
770

771 772 773
	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794
	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);

	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, setup an async extent, and immediately return
	 * with the entire range locked but with nobody actually marked with
	 * writeback.  So we can't just filemap_write_and_wait_range() and
	 * expect it to work since it will just kick off a thread to do the
	 * actual work.  So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there.  We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness.  Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags))
		filemap_fdatawrite_range(inode->i_mapping, start, orig_end);

	filemap_fdatawait_range(inode->i_mapping, start, orig_end);
795

796
	end = orig_end;
C
Chris Mason 已提交
797
	while (1) {
798
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
C
Chris Mason 已提交
799
		if (!ordered)
800
			break;
801
		if (ordered->file_offset > orig_end) {
802 803 804 805 806 807 808
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->len < start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
809
		btrfs_start_ordered_extent(inode, ordered, 1);
810 811
		end = ordered->file_offset;
		btrfs_put_ordered_extent(ordered);
812
		if (end == 0 || end == start)
813 814 815
			break;
		end--;
	}
816 817
}

818 819 820 821
/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
822 823 824 825 826 827 828 829
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
830
	spin_lock_irq(&tree->lock);
831 832 833 834 835 836 837 838 839 840
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		atomic_inc(&entry->refs);
out:
841
	spin_unlock_irq(&tree->lock);
842 843 844
	return entry;
}

845 846 847 848 849 850 851 852 853 854 855 856
/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
							u64 file_offset,
							u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
857
	spin_lock_irq(&tree->lock);
858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		atomic_inc(&entry->refs);
882
	spin_unlock_irq(&tree->lock);
883 884 885
	return entry;
}

886 887 888 889
/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
890
struct btrfs_ordered_extent *
C
Chris Mason 已提交
891
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
892 893 894 895 896 897
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
898
	spin_lock_irq(&tree->lock);
899 900 901 902 903 904 905
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	atomic_inc(&entry->refs);
out:
906
	spin_unlock_irq(&tree->lock);
907
	return entry;
908
}
909

910 911 912 913
/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size.  i_size is updated to cover any fully written part of the file.
 */
914
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
915 916 917 918 919
				struct btrfs_ordered_extent *ordered)
{
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	u64 disk_i_size;
	u64 new_i_size;
920
	u64 i_size = i_size_read(inode);
921
	struct rb_node *node;
922
	struct rb_node *prev = NULL;
923
	struct btrfs_ordered_extent *test;
924 925 926 927
	int ret = 1;

	if (ordered)
		offset = entry_end(ordered);
928 929
	else
		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
930

931
	spin_lock_irq(&tree->lock);
932 933
	disk_i_size = BTRFS_I(inode)->disk_i_size;

934 935 936 937 938 939 940
	/* truncate file */
	if (disk_i_size > i_size) {
		BTRFS_I(inode)->disk_i_size = i_size;
		ret = 0;
		goto out;
	}

941 942 943 944
	/*
	 * if the disk i_size is already at the inode->i_size, or
	 * this ordered extent is inside the disk i_size, we're done
	 */
J
Josef Bacik 已提交
945 946 947 948 949 950 951 952 953
	if (disk_i_size == i_size)
		goto out;

	/*
	 * We still need to update disk_i_size if outstanding_isize is greater
	 * than disk_i_size.
	 */
	if (offset <= disk_i_size &&
	    (!ordered || ordered->outstanding_isize <= disk_i_size))
954 955 956 957 958 959 960
		goto out;

	/*
	 * walk backward from this ordered extent to disk_i_size.
	 * if we find an ordered extent then we can't update disk i_size
	 * yet
	 */
961 962 963 964 965 966 967 968 969 970 971 972 973 974 975
	if (ordered) {
		node = rb_prev(&ordered->rb_node);
	} else {
		prev = tree_search(tree, offset);
		/*
		 * we insert file extents without involving ordered struct,
		 * so there should be no ordered struct cover this offset
		 */
		if (prev) {
			test = rb_entry(prev, struct btrfs_ordered_extent,
					rb_node);
			BUG_ON(offset_in_entry(test, offset));
		}
		node = prev;
	}
976
	for (; node; node = rb_prev(node)) {
977
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
978 979 980 981

		/* We treat this entry as if it doesnt exist */
		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
			continue;
982 983
		if (test->file_offset + test->len <= disk_i_size)
			break;
984
		if (test->file_offset >= i_size)
985
			break;
986
		if (entry_end(test) > disk_i_size) {
987 988 989 990 991 992 993 994 995 996 997 998
			/*
			 * we don't update disk_i_size now, so record this
			 * undealt i_size. Or we will not know the real
			 * i_size.
			 */
			if (test->outstanding_isize < offset)
				test->outstanding_isize = offset;
			if (ordered &&
			    ordered->outstanding_isize >
			    test->outstanding_isize)
				test->outstanding_isize =
						ordered->outstanding_isize;
999
			goto out;
1000
		}
1001
	}
1002
	new_i_size = min_t(u64, offset, i_size);
1003 1004

	/*
1005 1006
	 * Some ordered extents may completed before the current one, and
	 * we hold the real i_size in ->outstanding_isize.
1007
	 */
1008 1009
	if (ordered && ordered->outstanding_isize > new_i_size)
		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
1010
	BTRFS_I(inode)->disk_i_size = new_i_size;
1011
	ret = 0;
1012
out:
1013
	/*
1014 1015 1016 1017 1018
	 * We need to do this because we can't remove ordered extents until
	 * after the i_disk_size has been updated and then the inode has been
	 * updated to reflect the change, so we need to tell anybody who finds
	 * this ordered extent that we've already done all the real work, we
	 * just haven't completed all the other work.
1019 1020
	 */
	if (ordered)
1021 1022
		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
	spin_unlock_irq(&tree->lock);
1023
	return ret;
1024
}
1025

1026 1027 1028 1029 1030
/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum.  This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
1031
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
1032
			   u32 *sum, int len)
1033 1034 1035 1036 1037
{
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_sector_sum *sector_sums;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
1038 1039 1040
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
1041
	int index = 0;
1042 1043 1044

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
1045
		return 0;
1046

1047
	spin_lock_irq(&tree->lock);
Q
Qinghuang Feng 已提交
1048
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
1049 1050 1051 1052 1053 1054 1055 1056
		if (disk_bytenr >= ordered_sum->bytenr &&
		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
			i = (disk_bytenr - ordered_sum->bytenr) >>
			    inode->i_sb->s_blocksize_bits;
			sector_sums = ordered_sum->sums + i;
			num_sectors = ordered_sum->len >>
				      inode->i_sb->s_blocksize_bits;
			for (; i < num_sectors; i++) {
1057
				if (sector_sums[i].bytenr == disk_bytenr) {
1058 1059 1060 1061 1062
					sum[index] = sector_sums[i].sum;
					index++;
					if (index == len)
						goto out;
					disk_bytenr += sectorsize;
1063 1064
				}
			}
1065 1066 1067
		}
	}
out:
1068
	spin_unlock_irq(&tree->lock);
1069
	btrfs_put_ordered_extent(ordered);
1070
	return index;
1071 1072
}

1073

1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085
/*
 * add a given inode to the list of inodes that must be fully on
 * disk before a transaction commit finishes.
 *
 * This basically gives us the ext3 style data=ordered mode, and it is mostly
 * used to make sure renamed files are fully on disk.
 *
 * It is a noop if the inode is already fully on disk.
 *
 * If trans is not null, we'll do a friendly check for a transaction that
 * is already flushing things and force the IO down ourselves.
 */
1086 1087
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode)
1088
{
1089
	struct btrfs_transaction *cur_trans = trans->transaction;
1090 1091 1092 1093 1094 1095 1096 1097 1098
	u64 last_mod;

	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);

	/*
	 * if this file hasn't been changed since the last transaction
	 * commit, we can safely return without doing anything
	 */
	if (last_mod < root->fs_info->last_trans_committed)
1099
		return;
1100

1101
	spin_lock(&root->fs_info->ordered_root_lock);
1102 1103
	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
		list_add_tail(&BTRFS_I(inode)->ordered_operations,
1104
			      &cur_trans->ordered_operations);
1105
	}
1106
	spin_unlock(&root->fs_info->ordered_root_lock);
1107
}
1108 1109 1110 1111 1112 1113 1114 1115 1116

int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
				     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;
1117

1118 1119 1120 1121 1122 1123 1124 1125
	return 0;
}

void ordered_data_exit(void)
{
	if (btrfs_ordered_extent_cache)
		kmem_cache_destroy(btrfs_ordered_extent_cache);
}