ordered-data.c 31.5 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/slab.h>
20
#include <linux/blkdev.h>
21 22
#include <linux/writeback.h>
#include <linux/pagevec.h>
C
Chris Mason 已提交
23 24 25
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
26
#include "extent_io.h"
27
#include "disk-io.h"
C
Chris Mason 已提交
28

29 30
static struct kmem_cache *btrfs_ordered_extent_cache;

31
static u64 entry_end(struct btrfs_ordered_extent *entry)
C
Chris Mason 已提交
32
{
33 34 35
	if (entry->file_offset + entry->len < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->len;
C
Chris Mason 已提交
36 37
}

C
Chris Mason 已提交
38 39 40
/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
41 42
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
C
Chris Mason 已提交
43
{
C
Chris Mason 已提交
44 45
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
46
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
47

C
Chris Mason 已提交
48
	while (*p) {
C
Chris Mason 已提交
49
		parent = *p;
50
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
51

52
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
53
			p = &(*p)->rb_left;
54
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
55 56 57 58 59 60 61 62 63 64
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

65 66 67 68 69
static void ordered_data_tree_panic(struct inode *inode, int errno,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
70
		    "%llu\n", offset);
71 72
}

C
Chris Mason 已提交
73 74 75 76
/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
77 78
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
C
Chris Mason 已提交
79
{
C
Chris Mason 已提交
80
	struct rb_node *n = root->rb_node;
C
Chris Mason 已提交
81
	struct rb_node *prev = NULL;
82 83 84
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;
C
Chris Mason 已提交
85

C
Chris Mason 已提交
86
	while (n) {
87
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
C
Chris Mason 已提交
88 89 90
		prev = n;
		prev_entry = entry;

91
		if (file_offset < entry->file_offset)
C
Chris Mason 已提交
92
			n = n->rb_left;
93
		else if (file_offset >= entry_end(entry))
C
Chris Mason 已提交
94 95 96 97 98 99 100
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

C
Chris Mason 已提交
101
	while (prev && file_offset >= entry_end(prev_entry)) {
102 103 104 105 106 107 108 109 110 111 112 113 114
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
C
Chris Mason 已提交
115
	while (prev && file_offset < entry_end(prev_entry)) {
116 117 118 119 120 121
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
C
Chris Mason 已提交
122 123 124 125 126
	}
	*prev_ret = prev;
	return NULL;
}

C
Chris Mason 已提交
127 128 129
/*
 * helper to check if a given offset is inside a given entry
 */
130 131 132 133 134 135 136 137
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

138 139 140 141 142 143 144 145 146
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

C
Chris Mason 已提交
147 148 149 150
/*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
151 152
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
C
Chris Mason 已提交
153
{
154
	struct rb_root *root = &tree->tree;
155
	struct rb_node *prev = NULL;
C
Chris Mason 已提交
156
	struct rb_node *ret;
157 158 159 160 161 162 163 164 165
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
C
Chris Mason 已提交
166
	if (!ret)
167 168 169
		ret = prev;
	if (ret)
		tree->last = ret;
C
Chris Mason 已提交
170 171 172
	return ret;
}

173 174 175 176 177 178 179 180 181 182 183
/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
184 185
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
186
				      int type, int dio, int compress_type)
C
Chris Mason 已提交
187
{
188
	struct btrfs_root *root = BTRFS_I(inode)->root;
C
Chris Mason 已提交
189
	struct btrfs_ordered_inode_tree *tree;
190 191
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
C
Chris Mason 已提交
192

193
	tree = &BTRFS_I(inode)->ordered_tree;
194
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
C
Chris Mason 已提交
195 196 197
	if (!entry)
		return -ENOMEM;

198 199 200
	entry->file_offset = file_offset;
	entry->start = start;
	entry->len = len;
201 202 203
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
	    !(type == BTRFS_ORDERED_NOCOW))
		entry->csum_bytes_left = disk_len;
C
Chris Mason 已提交
204
	entry->disk_len = disk_len;
205
	entry->bytes_left = len;
206
	entry->inode = igrab(inode);
207
	entry->compress_type = compress_type;
208
	entry->truncated_len = (u64)-1;
Y
Yan Zheng 已提交
209
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
Y
Yan Zheng 已提交
210
		set_bit(type, &entry->flags);
211

212 213 214
	if (dio)
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

215 216 217 218
	/* one ref for the tree */
	atomic_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
219
	INIT_LIST_HEAD(&entry->root_extent_list);
220 221
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
222
	INIT_LIST_HEAD(&entry->log_list);
C
Chris Mason 已提交
223

224 225
	trace_btrfs_ordered_extent_add(inode, entry);

226
	spin_lock_irq(&tree->lock);
227 228
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
229 230
	if (node)
		ordered_data_tree_panic(inode, -EEXIST, file_offset);
231
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
232

233
	spin_lock(&root->ordered_extent_lock);
234
	list_add_tail(&entry->root_extent_list,
235 236 237 238 239 240 241 242 243 244
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&root->fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root,
			      &root->fs_info->ordered_roots);
		spin_unlock(&root->fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
245

C
Chris Mason 已提交
246 247 248
	return 0;
}

249 250 251 252
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
253 254
					  disk_len, type, 0,
					  BTRFS_COMPRESS_NONE);
255 256 257 258 259 260
}

int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
261 262 263 264 265 266 267 268 269 270 271
					  disk_len, type, 1,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  compress_type);
272 273
}

274 275
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
276 277
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
278
 */
279 280 281
void btrfs_add_ordered_sum(struct inode *inode,
			   struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
C
Chris Mason 已提交
282
{
283
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
284

285
	tree = &BTRFS_I(inode)->ordered_tree;
286
	spin_lock_irq(&tree->lock);
287
	list_add_tail(&sum->list, &entry->list);
288 289 290 291
	WARN_ON(entry->csum_bytes_left < sum->len);
	entry->csum_bytes_left -= sum->len;
	if (entry->csum_bytes_left == 0)
		wake_up(&entry->wait);
292
	spin_unlock_irq(&tree->lock);
C
Chris Mason 已提交
293 294
}

295 296 297 298 299 300 301 302 303 304 305 306 307 308
/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO may span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete.  This allows you to walk forward in the file.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
309
				   u64 *file_offset, u64 io_size, int uptodate)
310 311 312 313 314
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
315
	unsigned long flags;
316 317 318 319 320
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
321
	spin_lock_irqsave(&tree->lock, flags);
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size, entry->file_offset +
		      entry->len);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
		printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
340
		       dec_start, dec_end);
341 342 343 344
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
345
		       entry->bytes_left, to_dec);
346 347
	}
	entry->bytes_left -= to_dec;
348 349 350
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

351 352 353 354 355 356 357 358 359
	if (entry->bytes_left == 0)
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
	else
		ret = 1;
out:
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
360
	spin_unlock_irqrestore(&tree->lock, flags);
361 362 363
	return ret == 0;
}

364 365 366 367 368 369 370 371 372
/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO should not span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 */
373
int btrfs_dec_test_ordered_pending(struct inode *inode,
374
				   struct btrfs_ordered_extent **cached,
375
				   u64 file_offset, u64 io_size, int uptodate)
C
Chris Mason 已提交
376
{
377
	struct btrfs_ordered_inode_tree *tree;
C
Chris Mason 已提交
378
	struct rb_node *node;
379
	struct btrfs_ordered_extent *entry = NULL;
380
	unsigned long flags;
381 382 383
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
384 385 386 387 388 389
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

390
	node = tree_search(tree, file_offset);
C
Chris Mason 已提交
391
	if (!node) {
392 393
		ret = 1;
		goto out;
C
Chris Mason 已提交
394 395
	}

396
	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
397
have_entry:
398 399 400
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
C
Chris Mason 已提交
401
	}
402

403 404
	if (io_size > entry->bytes_left) {
		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
405
		       entry->bytes_left, io_size);
406 407
	}
	entry->bytes_left -= io_size;
408 409 410
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

411
	if (entry->bytes_left == 0)
412
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
413 414
	else
		ret = 1;
415
out:
416 417 418 419
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
420
	spin_unlock_irqrestore(&tree->lock, flags);
421 422
	return ret == 0;
}
C
Chris Mason 已提交
423

424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483
/* Needs to either be called under a log transaction or the log_mutex */
void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
{
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_ordered_extent *ordered;
	struct rb_node *n;
	int index = log->log_transid % 2;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		spin_lock(&log->log_extents_lock[index]);
		if (list_empty(&ordered->log_list)) {
			list_add_tail(&ordered->log_list, &log->logged_list[index]);
			atomic_inc(&ordered->refs);
		}
		spin_unlock(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&tree->lock);
}

void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
{
	struct btrfs_ordered_extent *ordered;
	int index = transid % 2;

	spin_lock_irq(&log->log_extents_lock[index]);
	while (!list_empty(&log->logged_list[index])) {
		ordered = list_first_entry(&log->logged_list[index],
					   struct btrfs_ordered_extent,
					   log_list);
		list_del_init(&ordered->log_list);
		spin_unlock_irq(&log->log_extents_lock[index]);
		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
						   &ordered->flags));
		btrfs_put_ordered_extent(ordered);
		spin_lock_irq(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&log->log_extents_lock[index]);
}

void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
{
	struct btrfs_ordered_extent *ordered;
	int index = transid % 2;

	spin_lock_irq(&log->log_extents_lock[index]);
	while (!list_empty(&log->logged_list[index])) {
		ordered = list_first_entry(&log->logged_list[index],
					   struct btrfs_ordered_extent,
					   log_list);
		list_del_init(&ordered->log_list);
		spin_unlock_irq(&log->log_extents_lock[index]);
		btrfs_put_ordered_extent(ordered);
		spin_lock_irq(&log->log_extents_lock[index]);
	}
	spin_unlock_irq(&log->log_extents_lock[index]);
}

484 485 486 487
/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
488
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
489
{
490 491 492
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

493 494
	trace_btrfs_ordered_extent_put(entry->inode, entry);

495
	if (atomic_dec_and_test(&entry->refs)) {
496 497
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
C
Chris Mason 已提交
498
		while (!list_empty(&entry->list)) {
499 500 501 502 503
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kfree(sum);
		}
504
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
505
	}
C
Chris Mason 已提交
506
}
507

508 509
/*
 * remove an ordered extent from the tree.  No references are dropped
510
 * and waiters are woken up.
511
 */
512 513
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
514
{
515
	struct btrfs_ordered_inode_tree *tree;
516
	struct btrfs_root *root = BTRFS_I(inode)->root;
517 518
	struct rb_node *node;

519
	tree = &BTRFS_I(inode)->ordered_tree;
520
	spin_lock_irq(&tree->lock);
521
	node = &entry->rb_node;
522
	rb_erase(node, &tree->tree);
523 524
	tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
525
	spin_unlock_irq(&tree->lock);
526

527
	spin_lock(&root->ordered_extent_lock);
528
	list_del_init(&entry->root_extent_list);
529
	root->nr_ordered_extents--;
530

531 532
	trace_btrfs_ordered_extent_remove(inode, entry);

533 534 535 536 537 538 539
	/*
	 * we have no more ordered extents for this inode and
	 * no dirty pages.  We can safely remove it from the
	 * list of ordered extents
	 */
	if (RB_EMPTY_ROOT(&tree->tree) &&
	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
540
		spin_lock(&root->fs_info->ordered_root_lock);
541
		list_del_init(&BTRFS_I(inode)->ordered_operations);
542
		spin_unlock(&root->fs_info->ordered_root_lock);
543
	}
544 545 546 547 548 549 550 551

	if (!root->nr_ordered_extents) {
		spin_lock(&root->fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&root->fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
552
	wake_up(&entry->wait);
553 554
}

555 556 557 558 559 560 561 562 563
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
	complete(&ordered->completion);
}

C
Chris Mason 已提交
564 565 566 567
/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
568
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
569
{
570 571
	struct list_head splice, works;
	struct btrfs_ordered_extent *ordered, *next;
572
	int count = 0;
573 574

	INIT_LIST_HEAD(&splice);
575
	INIT_LIST_HEAD(&works);
576

577
	mutex_lock(&root->fs_info->ordered_operations_mutex);
578 579
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
580
	while (!list_empty(&splice) && nr) {
581 582 583 584 585 586
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);
		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		atomic_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);
587

588 589 590 591
		ordered->flush_work.func = btrfs_run_ordered_extent_work;
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers,
				   &ordered->flush_work);
592

593
		cond_resched();
594
		spin_lock(&root->ordered_extent_lock);
595 596 597
		if (nr != -1)
			nr--;
		count++;
598
	}
599
	list_splice_tail(&splice, &root->ordered_extents);
600
	spin_unlock(&root->ordered_extent_lock);
601 602 603 604 605 606 607

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
608
	mutex_unlock(&root->fs_info->ordered_operations_mutex);
609 610

	return count;
611 612
}

613
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
614 615 616
{
	struct btrfs_root *root;
	struct list_head splice;
617
	int done;
618 619 620 621 622

	INIT_LIST_HEAD(&splice);

	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
623
	while (!list_empty(&splice) && nr) {
624 625 626 627 628 629 630 631
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

632
		done = btrfs_wait_ordered_extents(root, nr);
633 634 635
		btrfs_put_fs_root(root);

		spin_lock(&fs_info->ordered_root_lock);
636 637 638 639
		if (nr != -1) {
			nr -= done;
			WARN_ON(nr < 0);
		}
640
	}
641
	list_splice_tail(&splice, &fs_info->ordered_roots);
642 643 644
	spin_unlock(&fs_info->ordered_root_lock);
}

645 646 647 648 649 650 651 652 653 654
/*
 * this is used during transaction commit to write all the inodes
 * added to the ordered operation list.  These files must be fully on
 * disk before the transaction commits.
 *
 * we have two modes here, one is to just start the IO via filemap_flush
 * and the other is to wait for all the io.  When we wait, we have an
 * extra check to make sure the ordered operation list really is empty
 * before we return
 */
655 656
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, int wait)
657 658 659
{
	struct btrfs_inode *btrfs_inode;
	struct inode *inode;
660
	struct btrfs_transaction *cur_trans = trans->transaction;
661
	struct list_head splice;
662 663 664
	struct list_head works;
	struct btrfs_delalloc_work *work, *next;
	int ret = 0;
665 666

	INIT_LIST_HEAD(&splice);
667
	INIT_LIST_HEAD(&works);
668

669
	mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
670
	spin_lock(&root->fs_info->ordered_root_lock);
671
	list_splice_init(&cur_trans->ordered_operations, &splice);
672 673 674 675 676 677 678 679 680 681 682
	while (!list_empty(&splice)) {
		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
				   ordered_operations);
		inode = &btrfs_inode->vfs_inode;

		list_del_init(&btrfs_inode->ordered_operations);

		/*
		 * the inode may be getting freed (in sys_unlink path).
		 */
		inode = igrab(inode);
683 684
		if (!inode)
			continue;
685 686 687

		if (!wait)
			list_add_tail(&BTRFS_I(inode)->ordered_operations,
688
				      &cur_trans->ordered_operations);
689
		spin_unlock(&root->fs_info->ordered_root_lock);
690

691 692
		work = btrfs_alloc_delalloc_work(inode, wait, 1);
		if (!work) {
693
			spin_lock(&root->fs_info->ordered_root_lock);
694 695 696 697
			if (list_empty(&BTRFS_I(inode)->ordered_operations))
				list_add_tail(&btrfs_inode->ordered_operations,
					      &splice);
			list_splice_tail(&splice,
698
					 &cur_trans->ordered_operations);
699
			spin_unlock(&root->fs_info->ordered_root_lock);
700 701
			ret = -ENOMEM;
			goto out;
702
		}
703 704 705
		list_add_tail(&work->list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers,
				   &work->work);
706 707

		cond_resched();
708
		spin_lock(&root->fs_info->ordered_root_lock);
709
	}
710
	spin_unlock(&root->fs_info->ordered_root_lock);
711 712 713 714 715
out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}
716
	mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
717
	return ret;
718 719
}

720 721 722 723 724 725 726 727 728 729
/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
				       struct btrfs_ordered_extent *entry,
				       int wait)
730 731 732
{
	u64 start = entry->file_offset;
	u64 end = start + entry->len - 1;
733

734 735
	trace_btrfs_ordered_extent_start(inode, entry);

736 737 738
	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
739
	 * for the flusher thread to find them
740
	 */
741 742
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
C
Chris Mason 已提交
743
	if (wait) {
744 745
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
C
Chris Mason 已提交
746
	}
747
}
748

749 750 751
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
752
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
753
{
754
	int ret = 0;
755
	u64 end;
756
	u64 orig_end;
757
	struct btrfs_ordered_extent *ordered;
758 759

	if (start + len < start) {
760
		orig_end = INT_LIMIT(loff_t);
761 762
	} else {
		orig_end = start + len - 1;
763 764
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
765
	}
766

767 768 769
	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
770 771 772
	ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
	if (ret)
		return ret;
773 774 775 776 777 778 779 780 781 782 783 784 785 786 787
	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, setup an async extent, and immediately return
	 * with the entire range locked but with nobody actually marked with
	 * writeback.  So we can't just filemap_write_and_wait_range() and
	 * expect it to work since it will just kick off a thread to do the
	 * actual work.  So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there.  We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness.  Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
788 789 790 791 792 793 794 795 796
		     &BTRFS_I(inode)->runtime_flags)) {
		ret = filemap_fdatawrite_range(inode->i_mapping, start,
					       orig_end);
		if (ret)
			return ret;
	}
	ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
	if (ret)
		return ret;
797

798
	end = orig_end;
C
Chris Mason 已提交
799
	while (1) {
800
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
C
Chris Mason 已提交
801
		if (!ordered)
802
			break;
803
		if (ordered->file_offset > orig_end) {
804 805 806
			btrfs_put_ordered_extent(ordered);
			break;
		}
807
		if (ordered->file_offset + ordered->len <= start) {
808 809 810
			btrfs_put_ordered_extent(ordered);
			break;
		}
811
		btrfs_start_ordered_extent(inode, ordered, 1);
812
		end = ordered->file_offset;
813 814
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
815
		btrfs_put_ordered_extent(ordered);
816
		if (ret || end == 0 || end == start)
817 818 819
			break;
		end--;
	}
820
	return ret;
821 822
}

823 824 825 826
/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
827 828 829 830 831 832 833 834
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
835
	spin_lock_irq(&tree->lock);
836 837 838 839 840 841 842 843 844 845
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		atomic_inc(&entry->refs);
out:
846
	spin_unlock_irq(&tree->lock);
847 848 849
	return entry;
}

850 851 852 853 854 855 856 857 858 859 860 861
/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
							u64 file_offset,
							u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
862
	spin_lock_irq(&tree->lock);
863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		atomic_inc(&entry->refs);
887
	spin_unlock_irq(&tree->lock);
888 889 890
	return entry;
}

891 892 893 894
/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
895
struct btrfs_ordered_extent *
C
Chris Mason 已提交
896
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
897 898 899 900 901 902
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
903
	spin_lock_irq(&tree->lock);
904 905 906 907 908 909 910
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	atomic_inc(&entry->refs);
out:
911
	spin_unlock_irq(&tree->lock);
912
	return entry;
913
}
914

915 916 917 918
/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size.  i_size is updated to cover any fully written part of the file.
 */
919
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
920 921 922 923 924
				struct btrfs_ordered_extent *ordered)
{
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	u64 disk_i_size;
	u64 new_i_size;
925
	u64 i_size = i_size_read(inode);
926
	struct rb_node *node;
927
	struct rb_node *prev = NULL;
928
	struct btrfs_ordered_extent *test;
929 930
	int ret = 1;

931 932
	spin_lock_irq(&tree->lock);
	if (ordered) {
933
		offset = entry_end(ordered);
934 935 936 937 938
		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
			offset = min(offset,
				     ordered->file_offset +
				     ordered->truncated_len);
	} else {
939
		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
940
	}
941 942
	disk_i_size = BTRFS_I(inode)->disk_i_size;

943 944 945 946 947 948 949
	/* truncate file */
	if (disk_i_size > i_size) {
		BTRFS_I(inode)->disk_i_size = i_size;
		ret = 0;
		goto out;
	}

950 951 952 953
	/*
	 * if the disk i_size is already at the inode->i_size, or
	 * this ordered extent is inside the disk i_size, we're done
	 */
J
Josef Bacik 已提交
954 955 956 957 958 959 960 961 962
	if (disk_i_size == i_size)
		goto out;

	/*
	 * We still need to update disk_i_size if outstanding_isize is greater
	 * than disk_i_size.
	 */
	if (offset <= disk_i_size &&
	    (!ordered || ordered->outstanding_isize <= disk_i_size))
963 964 965 966 967 968 969
		goto out;

	/*
	 * walk backward from this ordered extent to disk_i_size.
	 * if we find an ordered extent then we can't update disk i_size
	 * yet
	 */
970 971 972 973 974 975 976 977 978 979 980 981 982 983 984
	if (ordered) {
		node = rb_prev(&ordered->rb_node);
	} else {
		prev = tree_search(tree, offset);
		/*
		 * we insert file extents without involving ordered struct,
		 * so there should be no ordered struct cover this offset
		 */
		if (prev) {
			test = rb_entry(prev, struct btrfs_ordered_extent,
					rb_node);
			BUG_ON(offset_in_entry(test, offset));
		}
		node = prev;
	}
985
	for (; node; node = rb_prev(node)) {
986
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
987 988 989 990

		/* We treat this entry as if it doesnt exist */
		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
			continue;
991 992
		if (test->file_offset + test->len <= disk_i_size)
			break;
993
		if (test->file_offset >= i_size)
994
			break;
995
		if (entry_end(test) > disk_i_size) {
996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007
			/*
			 * we don't update disk_i_size now, so record this
			 * undealt i_size. Or we will not know the real
			 * i_size.
			 */
			if (test->outstanding_isize < offset)
				test->outstanding_isize = offset;
			if (ordered &&
			    ordered->outstanding_isize >
			    test->outstanding_isize)
				test->outstanding_isize =
						ordered->outstanding_isize;
1008
			goto out;
1009
		}
1010
	}
1011
	new_i_size = min_t(u64, offset, i_size);
1012 1013

	/*
1014 1015
	 * Some ordered extents may completed before the current one, and
	 * we hold the real i_size in ->outstanding_isize.
1016
	 */
1017 1018
	if (ordered && ordered->outstanding_isize > new_i_size)
		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
1019
	BTRFS_I(inode)->disk_i_size = new_i_size;
1020
	ret = 0;
1021
out:
1022
	/*
1023 1024 1025 1026 1027
	 * We need to do this because we can't remove ordered extents until
	 * after the i_disk_size has been updated and then the inode has been
	 * updated to reflect the change, so we need to tell anybody who finds
	 * this ordered extent that we've already done all the real work, we
	 * just haven't completed all the other work.
1028 1029
	 */
	if (ordered)
1030 1031
		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
	spin_unlock_irq(&tree->lock);
1032
	return ret;
1033
}
1034

1035 1036 1037 1038 1039
/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum.  This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
1040
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
1041
			   u32 *sum, int len)
1042 1043 1044 1045
{
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
1046 1047 1048
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
1049
	int index = 0;
1050 1051 1052

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
1053
		return 0;
1054

1055
	spin_lock_irq(&tree->lock);
Q
Qinghuang Feng 已提交
1056
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
1057 1058 1059 1060 1061 1062
		if (disk_bytenr >= ordered_sum->bytenr &&
		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
			i = (disk_bytenr - ordered_sum->bytenr) >>
			    inode->i_sb->s_blocksize_bits;
			num_sectors = ordered_sum->len >>
				      inode->i_sb->s_blocksize_bits;
1063 1064 1065 1066 1067 1068 1069 1070
			num_sectors = min_t(int, len - index, num_sectors - i);
			memcpy(sum + index, ordered_sum->sums + i,
			       num_sectors);

			index += (int)num_sectors;
			if (index == len)
				goto out;
			disk_bytenr += num_sectors * sectorsize;
1071 1072 1073
		}
	}
out:
1074
	spin_unlock_irq(&tree->lock);
1075
	btrfs_put_ordered_extent(ordered);
1076
	return index;
1077 1078
}

1079

1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
/*
 * add a given inode to the list of inodes that must be fully on
 * disk before a transaction commit finishes.
 *
 * This basically gives us the ext3 style data=ordered mode, and it is mostly
 * used to make sure renamed files are fully on disk.
 *
 * It is a noop if the inode is already fully on disk.
 *
 * If trans is not null, we'll do a friendly check for a transaction that
 * is already flushing things and force the IO down ourselves.
 */
1092 1093
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode)
1094
{
1095
	struct btrfs_transaction *cur_trans = trans->transaction;
1096 1097 1098 1099 1100 1101 1102 1103
	u64 last_mod;

	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);

	/*
	 * if this file hasn't been changed since the last transaction
	 * commit, we can safely return without doing anything
	 */
1104
	if (last_mod <= root->fs_info->last_trans_committed)
1105
		return;
1106

1107
	spin_lock(&root->fs_info->ordered_root_lock);
1108 1109
	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
		list_add_tail(&BTRFS_I(inode)->ordered_operations,
1110
			      &cur_trans->ordered_operations);
1111
	}
1112
	spin_unlock(&root->fs_info->ordered_root_lock);
1113
}
1114 1115 1116 1117 1118 1119 1120 1121 1122

int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
				     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;
1123

1124 1125 1126 1127 1128 1129 1130 1131
	return 0;
}

void ordered_data_exit(void)
{
	if (btrfs_ordered_extent_cache)
		kmem_cache_destroy(btrfs_ordered_extent_cache);
}