/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
#include "inode-map.h"

#define BTRFS_ROOT_TRANS_TAG 0

void put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
		BUG_ON(!list_empty(&transaction->list));
		WARN_ON(transaction->delayed_refs.root.rb_node);
		WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
	}
}

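/*
 * drop the old commit root and make the root's current node the new
 * commit root
 */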
static noinline void switch_commit_root(struct btrfs_root *root)
{
	free_extent_buffer(root->commit_root);
	root->commit_root = btrfs_root_node(root);
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root, int nofail)
{
	struct btrfs_transaction *cur_trans;
	struct btrfs_fs_info *fs_info = root->fs_info;

	spin_lock(&fs_info->trans_lock);
loop:
	/* The file system has been taken offline. No new transactions. */
	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		spin_unlock(&fs_info->trans_lock);
		return -EROFS;
	}

	if (fs_info->trans_no_join) {
		if (!nofail) {
			spin_unlock(&fs_info->trans_lock);
			return -EBUSY;
		}
	}

	cur_trans = fs_info->running_transaction;
	if (cur_trans) {
		if (cur_trans->aborted) {
			spin_unlock(&fs_info->trans_lock);
			return cur_trans->aborted;
		}
		atomic_inc(&cur_trans->use_count);
		atomic_inc(&cur_trans->num_writers);
		cur_trans->num_joined++;
		spin_unlock(&fs_info->trans_lock);
		return 0;
	}
	spin_unlock(&fs_info->trans_lock);

	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;

	spin_lock(&fs_info->trans_lock);
	if (fs_info->running_transaction) {
		/*
		 * someone started a transaction after we unlocked.  Make sure
		 * to redo the trans_no_join checks above
		 */
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		cur_trans = fs_info->running_transaction;
		goto loop;
	}

	atomic_set(&cur_trans->num_writers, 1);
	cur_trans->num_joined = 0;
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->in_commit = 0;
	cur_trans->blocked = 0;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
	cur_trans->commit_done = 0;
	cur_trans->start_time = get_seconds();

	cur_trans->delayed_refs.root = RB_ROOT;
	cur_trans->delayed_refs.num_entries = 0;
	cur_trans->delayed_refs.num_heads_ready = 0;
	cur_trans->delayed_refs.num_heads = 0;
	cur_trans->delayed_refs.flushing = 0;
	cur_trans->delayed_refs.run_delayed_start = 0;
	cur_trans->delayed_refs.seq = 1;
	init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
	spin_lock_init(&cur_trans->commit_lock);
	spin_lock_init(&cur_trans->delayed_refs.lock);
	INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
	list_add_tail(&cur_trans->list, &fs_info->trans_list);
	extent_io_tree_init(&cur_trans->dirty_pages,
			     fs_info->btree_inode->i_mapping);
	fs_info->generation++;
	cur_trans->transid = fs_info->generation;
	fs_info->running_transaction = cur_trans;
	cur_trans->aborted = 0;
	spin_unlock(&fs_info->trans_lock);

	return 0;
}

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
static int record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (root->ref_cows && root->last_trans < trans->transid) {
		WARN_ON(root == root->fs_info->extent_root);
		WARN_ON(root->commit_root != root->node);

		/*
		 * see below for in_trans_setup usage rules
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
		root->in_trans_setup = 1;

		/* make sure readers find in_trans_setup before
		 * they find our root->last_trans update
		 */
		smp_wmb();

		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
		 * with root->in_trans_setup.  When this is 1, we're still
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
		btrfs_init_reloc_root(trans, root);
		smp_wmb();
		root->in_trans_setup = 0;
	}
	return 0;
}

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (!root->ref_cows)
		return 0;

	/*
	 * see record_root_in_trans for comments about in_trans_setup usage
	 * and barriers
	 */
	smp_rmb();
	if (root->last_trans == trans->transid &&
	    !root->in_trans_setup)
		return 0;

	mutex_lock(&root->fs_info->reloc_mutex);
	record_root_in_trans(trans, root);
	mutex_unlock(&root->fs_info->reloc_mutex);

	return 0;
}

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	spin_lock(&root->fs_info->trans_lock);
	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		atomic_inc(&cur_trans->use_count);
		spin_unlock(&root->fs_info->trans_lock);

		wait_event(root->fs_info->transaction_wait,
			   !cur_trans->blocked);
		put_transaction(cur_trans);
	} else {
		spin_unlock(&root->fs_info->trans_lock);
	}
}

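/*
 * the different ways a transaction handle can be started, passed down to
 * start_transaction() by the wrappers below
 */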
enum btrfs_trans_type {
	TRANS_START,
	TRANS_JOIN,
	TRANS_USERSPACE,
	TRANS_JOIN_NOLOCK,
};

static int may_wait_transaction(struct btrfs_root *root, int type)
{
	if (root->fs_info->log_root_recovering)
		return 0;

	if (type == TRANS_USERSPACE)
		return 1;

	if (type == TRANS_START &&
	    !atomic_read(&root->fs_info->open_ioctl_trans))
		return 1;

	return 0;
}

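/*
 * common helper behind all of the transaction start functions: reserve
 * metadata space when num_items is given, then join (or create) the running
 * transaction and return a handle for it
 */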
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
						    u64 num_items, int type)
{
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
	u64 num_bytes = 0;
	int ret;

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return ERR_PTR(-EROFS);

	if (current->journal_info) {
		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
		h = current->journal_info;
		h->use_count++;
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items > 0 && root != root->fs_info->chunk_root) {
		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
		ret = btrfs_block_rsv_add(root,
					  &root->fs_info->trans_block_rsv,
					  num_bytes);
		if (ret)
			return ERR_PTR(ret);
	}
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);

	if (may_wait_transaction(root, type))
		wait_current_trans(root);

	do {
		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
		if (ret == -EBUSY)
			wait_current_trans(root);
	} while (ret == -EBUSY);

	if (ret < 0) {
		kmem_cache_free(btrfs_trans_handle_cachep, h);
		return ERR_PTR(ret);
	}

	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
	h->blocks_used = 0;
	h->bytes_reserved = 0;
	h->delayed_ref_updates = 0;
	h->use_count = 1;
	h->block_rsv = NULL;
	h->orig_rsv = NULL;
	h->aborted = 0;

	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

	if (num_bytes) {
		trace_btrfs_space_reservation(root->fs_info, "transaction",
					      h->transid, num_bytes, 1);
		h->block_rsv = &root->fs_info->trans_block_rsv;
		h->bytes_reserved = num_bytes;
	}

got_it:
	btrfs_record_root_in_trans(h, root);

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
	return h;
}

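/*
 * thin wrappers around start_transaction() for the common join types.
 *
 * A typical caller pattern looks roughly like this (sketch only, not code
 * taken from this file):
 *
 *	trans = btrfs_start_transaction(root, num_items);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	... modify up to num_items tree items ...
 *	btrfs_end_transaction(trans, root);
 */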
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_items)
{
	return start_transaction(root, num_items, TRANS_START);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN);
}

struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_USERSPACE);
}

/* wait for a transaction commit to be fully complete */
static noinline void wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	wait_event(commit->commit_wait, commit->commit_done);
}

int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_transaction *cur_trans = NULL, *t;
	int ret;

	ret = 0;
	if (transid) {
		if (transid <= root->fs_info->last_trans_committed)
			goto out;

		/* find specified transaction */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry(t, &root->fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
			if (t->transid > transid)
				break;
		}
		spin_unlock(&root->fs_info->trans_lock);
		ret = -EINVAL;
		if (!cur_trans)
			goto out;  /* bad transid */
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
					    list) {
			if (t->in_commit) {
				if (t->commit_done)
					break;
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
		}
		spin_unlock(&root->fs_info->trans_lock);
		if (!cur_trans)
			goto out;  /* nothing committing|committed */
	}

	wait_for_commit(root, cur_trans);

	put_transaction(cur_trans);
	ret = 0;
out:
	return ret;
}

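/*
 * throttle new callers while a transaction commit is blocking writers,
 * unless a userspace ioctl currently holds the transaction open
 */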
void btrfs_throttle(struct btrfs_root *root)
{
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
		wait_current_trans(root);
}

static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;

	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_block_rsv *rsv = trans->block_rsv;
	int updates;
	int err;

	smp_mb();
	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

	/*
	 * We need to do this in case we're deleting csums so the global block
	 * rsv gets used instead of the csum block rsv.
	 */
	trans->block_rsv = NULL;

	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
	if (updates) {
		err = btrfs_run_delayed_refs(trans, root, updates);
		if (err) /* Error code will also eval true */
			return err;
	}

	trans->block_rsv = rsv;

	return should_end_transaction(trans, root);
}

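/*
 * common helper for the btrfs_end_transaction() variants: drop this handle's
 * reference, run a small batch of delayed refs, and either wake the
 * transaction kthread or commit directly when the transaction is blocked
 */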
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, int throttle, int lock)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_fs_info *info = root->fs_info;
	int count = 0;
	int err = 0;

	if (--trans->use_count) {
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

	btrfs_trans_release_metadata(trans, root);
	trans->block_rsv = NULL;
	while (count < 2) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
	}

	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
	    should_end_transaction(trans, root)) {
		trans->transaction->blocked = 1;
		smp_wmb();
	}

	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
		if (throttle) {
			/*
			 * We may race with somebody else here so end up having
			 * to call end_transaction on ourselves again, so inc
			 * our use_count.
			 */
			trans->use_count++;
			return btrfs_commit_transaction(trans, root);
		} else {
			wake_up_process(info->transaction_kthread);
		}
	}

	WARN_ON(cur_trans != info->running_transaction);
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);

	smp_mb();
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	if (throttle)
		btrfs_run_delayed_iputs(root);

	if (trans->aborted ||
	    root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		err = -EIO;
	}

	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
	return err;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	int ret;

	ret = __btrfs_end_transaction(trans, root, 0, 1);
	if (ret)
		return ret;
	return 0;
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	int ret;

	ret = __btrfs_end_transaction(trans, root, 1, 1);
	if (ret)
		return ret;
	return 0;
}

int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	int ret;

	ret = __btrfs_end_transaction(trans, root, 0, 0);
	if (ret)
		return ret;
	return 0;
}

int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but does not wait on them
 */
int btrfs_write_marked_extents(struct btrfs_root *root,
			       struct extent_io_tree *dirty_pages, int mark)
{
	int err = 0;
	int werr = 0;
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
	u64 start = 0;
	u64 end;

	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      mark)) {
		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
				   GFP_NOFS);
		err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
	}
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
			      struct extent_io_tree *dirty_pages, int mark)
{
	int err = 0;
	int werr = 0;
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
	u64 start = 0;
	u64 end;

	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      EXTENT_NEED_WAIT)) {
		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
		err = filemap_fdatawait_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
	}
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int ret2;

	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);

	if (ret)
		return ret;
	if (ret2)
		return ret2;
	return 0;
}

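/*
 * write out and wait on every dirty btree block in this transaction, or on
 * the whole btree inode when no transaction is given
 */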
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	u64 old_root_used;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	old_root_used = btrfs_root_used(&root->root_item);
	btrfs_write_dirty_block_groups(trans, root);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
			break;

		btrfs_set_root_node(&root->root_item, root->node);
		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		if (ret)
			return ret;

		old_root_used = btrfs_root_used(&root->root_item);
		ret = btrfs_write_dirty_block_groups(trans, root);
		if (ret)
			return ret;
	}

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

	return 0;
}

/*
 * update all the cowonly tree roots on disk
 *
 * The error handling in this function may not be obvious. Any of the
 * failures will cause the file system to go offline. We still need
 * to clean up the delayed refs.
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	if (ret)
		return ret;

	eb = btrfs_lock_root_node(fs_info->tree_root);
	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
			      0, &eb);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	if (ret)
		return ret;

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	if (ret)
		return ret;

	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);

		ret = update_cowonly_root(trans, root);
		if (ret)
			return ret;
	}

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

	return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root)
{
	spin_lock(&root->fs_info->trans_lock);
	list_add(&root->root_list, &root->fs_info->dead_roots);
	spin_unlock(&root->fs_info->trans_lock);
	return 0;
}

/*
 * update all the fs tree roots on disk for this transaction
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
{
	struct btrfs_root *gang[8];
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	int ret;
	int err = 0;

	spin_lock(&fs_info->fs_roots_radix_lock);
	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
			spin_unlock(&fs_info->fs_roots_radix_lock);

			btrfs_free_log(trans, root);
			btrfs_update_reloc_root(trans, root);
			btrfs_orphan_commit_root(trans, root);

			btrfs_save_ino_cache(root, trans);

			/* see comments in should_cow_block() */
			root->force_cow = 0;
			smp_wmb();

			if (root->commit_root != root->node) {
				mutex_lock(&root->fs_commit_mutex);
				switch_commit_root(root);
				btrfs_unpin_free_ino(root);
				mutex_unlock(&root->fs_commit_mutex);

				btrfs_set_root_node(&root->root_item,
						    root->node);
			}

			err = btrfs_update_root(trans, fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			spin_lock(&fs_info->fs_roots_radix_lock);
			if (err)
				break;
		}
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret;
	unsigned long nr;

	if (xchg(&root->defrag_running, 1))
		return 0;

	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_defrag_leaves(trans, root, cacheonly);

		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	return ret;
}

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct btrfs_root *parent_root;
	struct btrfs_block_rsv *rsv;
	struct inode *parent_inode;
	struct dentry *parent;
	struct dentry *dentry;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	u64 to_reserve = 0;
	u64 index = 0;
	u64 objectid;
	u64 root_flags;

	rsv = trans->block_rsv;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		ret = pending->error = -ENOMEM;
		goto fail;
	}

	ret = btrfs_find_free_objectid(tree_root, &objectid);
	if (ret) {
		pending->error = ret;
		goto fail;
	}

	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);

	if (to_reserve > 0) {
		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
						  to_reserve);
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	trans->block_rsv = &pending->block_rsv;

	dentry = pending->dentry;
	parent = dget_parent(dentry);
	parent_inode = parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	record_root_in_trans(trans, parent_root);

	/*
	 * insert the directory item
	 */
	ret = btrfs_set_inode_index(parent_inode, &index);
	BUG_ON(ret); /* -ENOMEM */
	ret = btrfs_insert_dir_item(trans, parent_root,
				dentry->d_name.name, dentry->d_name.len,
				parent_inode, &key,
				BTRFS_FT_DIR, index);
	if (ret == -EEXIST) {
		pending->error = -EEXIST;
		dput(parent);
		goto fail;
	} else if (ret) {
		goto abort_trans_dput;
	}

	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	if (ret)
		goto abort_trans_dput;

	/*
	 * pull in the delayed directory update
	 * and the delayed inode item
	 * otherwise we corrupt the FS during
	 * snapshot
	 */
	ret = btrfs_run_delayed_items(trans, root);
	if (ret) { /* Transaction aborted */
		dput(parent);
		goto fail;
	}

	record_root_in_trans(trans, root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
	btrfs_check_and_init_root_item(new_root_item);

	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

	old = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
	if (ret) {
		btrfs_tree_unlock(old);
		free_extent_buffer(old);
		goto abort_trans_dput;
	}

	btrfs_set_lock_blocking(old);

	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
	/* clean up in any case */
	btrfs_tree_unlock(old);
	free_extent_buffer(old);
	if (ret)
		goto abort_trans_dput;

	/* see comments in should_cow_block() */
	root->force_cow = 1;
	smp_wmb();

	btrfs_set_root_node(new_root_item, tmp);
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	if (ret)
		goto abort_trans_dput;

	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
				 parent_root->root_key.objectid,
				 btrfs_ino(parent_inode), index,
				 dentry->d_name.name, dentry->d_name.len);
	dput(parent);
	if (ret)
		goto fail;

	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
	if (IS_ERR(pending->snap)) {
		ret = PTR_ERR(pending->snap);
		goto abort_trans;
	}

	ret = btrfs_reloc_post_snapshot(trans, pending);
	if (ret)
		goto abort_trans;
	ret = 0;
fail:
	kfree(new_root_item);
	trans->block_rsv = rsv;
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
	return ret;

abort_trans_dput:
	dput(parent);
abort_trans:
	btrfs_abort_transaction(trans, root, ret);
	goto fail;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;

	list_for_each_entry(pending, head, list)
		create_pending_snapshot(trans, fs_info, pending);
	return 0;
}

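/*
 * copy the latest tree root and chunk root pointers into the in-memory
 * copy of the superblock that will be written by this commit
 */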
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

	super = root->fs_info->super_copy;

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
	if (btrfs_test_opt(root, SPACE_CACHE))
		super->cache_generation = root_item->generation;
}

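/*
 * helpers for callers that only need to peek at the state of the currently
 * running transaction
 */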
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
	spin_unlock(&info->trans_lock);
	return ret;
}

int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
	spin_unlock(&info->trans_lock);
	return ret;
}

/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
					    struct btrfs_transaction *trans)
{
	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
					 struct btrfs_transaction *trans)
{
	wait_event(root->fs_info->transaction_wait,
		   trans->commit_done || (trans->in_commit && !trans->blocked));
}

/*
 * commit transactions asynchronously. once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 */
struct btrfs_async_commit {
	struct btrfs_trans_handle *newtrans;
	struct btrfs_root *root;
	struct delayed_work work;
};

static void do_async_commit(struct work_struct *work)
{
	struct btrfs_async_commit *ac =
		container_of(work, struct btrfs_async_commit, work.work);

	btrfs_commit_transaction(ac->newtrans, ac->root);
	kfree(ac);
}

int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   int wait_for_unblock)
{
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
	if (!ac)
		return -ENOMEM;

	INIT_DELAYED_WORK(&ac->work, do_async_commit);
	ac->root = root;
	ac->newtrans = btrfs_join_transaction(root);
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}

	/* take transaction reference */
	cur_trans = trans->transaction;
	atomic_inc(&cur_trans->use_count);

	btrfs_end_transaction(trans, root);
	schedule_delayed_work(&ac->work, 0);

	/* wait for transaction to start and unblock */
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(root, cur_trans);
	else
		wait_current_trans_commit_start(root, cur_trans);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	put_transaction(cur_trans);
	return 0;
}


static void cleanup_transaction(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;

	WARN_ON(trans->use_count > 1);

	spin_lock(&root->fs_info->trans_lock);
	list_del_init(&cur_trans->list);
	spin_unlock(&root->fs_info->trans_lock);

	btrfs_cleanup_one_transaction(trans->transaction, root);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	trace_btrfs_transaction_commit(root);

	btrfs_scrub_continue(root);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
}

/*
 * btrfs_transaction state sequence:
 *    in_commit = 0, blocked = 0  (initial)
 *    in_commit = 1, blocked = 1
 *    blocked = 0
 *    commit_done = 1
 */
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
	unsigned long joined = 0;
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_transaction *prev_trans = NULL;
	DEFINE_WAIT(wait);
	int ret = -EIO;
	int should_grow = 0;
	unsigned long now = get_seconds();
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

	btrfs_run_ordered_operations(root, 0);

	btrfs_trans_release_metadata(trans, root);
	trans->block_rsv = NULL;

	if (cur_trans->aborted)
		goto cleanup_transaction;

	/* make a pass through all the delayed refs we have so far
	 * any running procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	if (ret)
		goto cleanup_transaction;

	cur_trans = trans->transaction;

	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;

	ret = btrfs_run_delayed_refs(trans, root, 0);
	if (ret)
		goto cleanup_transaction;

	spin_lock(&cur_trans->commit_lock);
	if (cur_trans->in_commit) {
		spin_unlock(&cur_trans->commit_lock);
		atomic_inc(&cur_trans->use_count);
		ret = btrfs_end_transaction(trans, root);

		wait_for_commit(root, cur_trans);

		put_transaction(cur_trans);

		return ret;
	}

	trans->transaction->in_commit = 1;
	trans->transaction->blocked = 1;
	spin_unlock(&cur_trans->commit_lock);
	wake_up(&root->fs_info->transaction_blocked_wait);

	spin_lock(&root->fs_info->trans_lock);
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
			atomic_inc(&prev_trans->use_count);
			spin_unlock(&root->fs_info->trans_lock);

			wait_for_commit(root, prev_trans);

			put_transaction(prev_trans);
		} else {
			spin_unlock(&root->fs_info->trans_lock);
		}
	} else {
		spin_unlock(&root->fs_info->trans_lock);
	}

	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

	do {
		int snap_pending = 0;

		joined = cur_trans->num_joined;
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

		WARN_ON(cur_trans != trans->transaction);

		if (flush_on_commit || snap_pending) {
			btrfs_start_delalloc_inodes(root, 1);
			btrfs_wait_ordered_extents(root, 0, 1);
		}

		ret = btrfs_run_delayed_items(trans, root);
		if (ret)
			goto cleanup_transaction;

		/*
		 * rename doesn't use btrfs_join_transaction, so, once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and know for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (atomic_read(&cur_trans->num_writers) > 1)
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);

		finish_wait(&cur_trans->writer_wait, &wait);
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
		 (should_grow && cur_trans->num_joined != joined));

	/*
	 * Ok now we need to make sure to block out any other joins while we
	 * commit the transaction.  We could have started a join before setting
	 * no_join so make sure to wait for num_writers to == 1 again.
	 */
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->trans_no_join = 1;
	spin_unlock(&root->fs_info->trans_lock);
	wait_event(cur_trans->writer_wait,
		   atomic_read(&cur_trans->num_writers) == 1);

	/*
	 * the reloc mutex makes sure that we stop
	 * the balancing code from coming in and moving
	 * extents around in the middle of the commit
	 */
	mutex_lock(&root->fs_info->reloc_mutex);

	ret = btrfs_run_delayed_items(trans, root);
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}

	ret = create_pending_snapshots(trans, root->fs_info);
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}

	/*
	 * make sure none of the code above managed to slip in a
	 * delayed item
	 */
	btrfs_assert_delayed_root_empty(root);

	WARN_ON(cur_trans != trans->transaction);

	btrfs_scrub_pause(root);
	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

	ret = commit_fs_roots(trans, root);
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}

	/* commit_fs_roots gets rid of all the tree log roots, it is now
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

	ret = commit_cowonly_roots(trans, root);
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}

	btrfs_prepare_extent_commit(trans, root);

	cur_trans = root->fs_info->running_transaction;

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
	switch_commit_root(root->fs_info->tree_root);

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
	switch_commit_root(root->fs_info->chunk_root);

	update_super_roots(root);

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
	}

	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
	       sizeof(*root->fs_info->super_copy));

	trans->transaction->blocked = 0;
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->running_transaction = NULL;
	root->fs_info->trans_no_join = 0;
	spin_unlock(&root->fs_info->trans_lock);
	mutex_unlock(&root->fs_info->reloc_mutex);

	wake_up(&root->fs_info->transaction_wait);

	ret = btrfs_write_and_wait_transaction(trans, root);
	if (ret) {
		btrfs_error(root->fs_info, ret,
			    "Error while writing out transaction.");
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}

	ret = write_ctree_super(trans, root, 0);
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}

	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, root);

	cur_trans->commit_done = 1;

	root->fs_info->last_trans_committed = cur_trans->transid;

	wake_up(&cur_trans->commit_wait);

	spin_lock(&root->fs_info->trans_lock);
	list_del_init(&cur_trans->list);
	spin_unlock(&root->fs_info->trans_lock);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	trace_btrfs_transaction_commit(root);

	btrfs_scrub_continue(root);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

	return ret;

cleanup_transaction:
	btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
//	WARN_ON(1);
	if (current->journal_info == trans)
		current->journal_info = NULL;
	cleanup_transaction(trans, root);

	return ret;
}

/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

	spin_lock(&fs_info->trans_lock);
	list_splice_init(&fs_info->dead_roots, &list);
	spin_unlock(&fs_info->trans_lock);

	while (!list_empty(&list)) {
		int ret;

		root = list_entry(list.next, struct btrfs_root, root_list);
		list_del(&root->root_list);

		btrfs_kill_all_delayed_nodes(root);

		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
			ret = btrfs_drop_snapshot(root, NULL, 0, 0);
		else
			ret = btrfs_drop_snapshot(root, NULL, 1, 0);
		BUG_ON(ret < 0);
	}
	return 0;
}