transaction.c 42.4 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

C
Chris Mason 已提交
19
#include <linux/fs.h>
20
#include <linux/slab.h>
C
Chris Mason 已提交
21
#include <linux/sched.h>
22
#include <linux/writeback.h>
23
#include <linux/pagemap.h>
24
#include <linux/blkdev.h>
25
#include <linux/uuid.h>
C
Chris Mason 已提交
26 27 28
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
29
#include "locking.h"
30
#include "tree-log.h"
31
#include "inode-map.h"
32
#include "volumes.h"
C
Chris Mason 已提交
33

34 35
#define BTRFS_ROOT_TRANS_TAG 0

36
void put_transaction(struct btrfs_transaction *transaction)
C
Chris Mason 已提交
37
{
38 39
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
J
Josef Bacik 已提交
40
		BUG_ON(!list_empty(&transaction->list));
41
		WARN_ON(transaction->delayed_refs.root.rb_node);
C
Chris Mason 已提交
42 43
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
C
Chris Mason 已提交
44
	}
C
Chris Mason 已提交
45 46
}

J
Josef Bacik 已提交
47 48 49 50 51 52
/*
 * Release the extent buffer currently stored in root->commit_root and
 * replace it with whatever btrfs_root_node() returns for @root.
 */
static noinline void switch_commit_root(struct btrfs_root *root)
{
	free_extent_buffer(root->commit_root);
	root->commit_root = btrfs_root_node(root);
}

C
Chris Mason 已提交
53 54 55
/*
 * either allocate a new transaction or hop into the existing one
 */
J
Josef Bacik 已提交
56
static noinline int join_transaction(struct btrfs_root *root, int nofail)
C
Chris Mason 已提交
57 58
{
	struct btrfs_transaction *cur_trans;
59
	struct btrfs_fs_info *fs_info = root->fs_info;
J
Josef Bacik 已提交
60

61
	spin_lock(&fs_info->trans_lock);
62
loop:
63
	/* The file system has been taken offline. No new transactions. */
64 65
	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		spin_unlock(&fs_info->trans_lock);
66 67 68
		return -EROFS;
	}

69
	if (fs_info->trans_no_join) {
J
Josef Bacik 已提交
70
		if (!nofail) {
71
			spin_unlock(&fs_info->trans_lock);
J
Josef Bacik 已提交
72 73 74 75
			return -EBUSY;
		}
	}

76
	cur_trans = fs_info->running_transaction;
J
Josef Bacik 已提交
77
	if (cur_trans) {
78
		if (cur_trans->aborted) {
79
			spin_unlock(&fs_info->trans_lock);
80
			return cur_trans->aborted;
81
		}
J
Josef Bacik 已提交
82
		atomic_inc(&cur_trans->use_count);
83
		atomic_inc(&cur_trans->num_writers);
84
		cur_trans->num_joined++;
85
		spin_unlock(&fs_info->trans_lock);
J
Josef Bacik 已提交
86
		return 0;
C
Chris Mason 已提交
87
	}
88
	spin_unlock(&fs_info->trans_lock);
J
Josef Bacik 已提交
89 90 91 92

	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
93

94 95
	spin_lock(&fs_info->trans_lock);
	if (fs_info->running_transaction) {
96 97 98 99
		/*
		 * someone started a transaction after we unlocked.  Make sure
		 * to redo the trans_no_join checks above
		 */
J
Josef Bacik 已提交
100
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
101
		cur_trans = fs_info->running_transaction;
102
		goto loop;
103 104
	} else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		spin_unlock(&fs_info->trans_lock);
105 106
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		return -EROFS;
C
Chris Mason 已提交
107
	}
108

J
Josef Bacik 已提交
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
	atomic_set(&cur_trans->num_writers, 1);
	cur_trans->num_joined = 0;
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->in_commit = 0;
	cur_trans->blocked = 0;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
	cur_trans->commit_done = 0;
	cur_trans->start_time = get_seconds();

	cur_trans->delayed_refs.root = RB_ROOT;
	cur_trans->delayed_refs.num_entries = 0;
	cur_trans->delayed_refs.num_heads_ready = 0;
	cur_trans->delayed_refs.num_heads = 0;
	cur_trans->delayed_refs.flushing = 0;
	cur_trans->delayed_refs.run_delayed_start = 0;
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146

	/*
	 * although the tree mod log is per file system and not per transaction,
	 * the log must never go across transaction boundaries.
	 */
	smp_mb();
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
			"creating a fresh transaction\n");
		WARN_ON(1);
	}
	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
		printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
			"creating a fresh transaction\n");
		WARN_ON(1);
	}
	atomic_set(&fs_info->tree_mod_seq, 0);

J
Josef Bacik 已提交
147 148 149 150
	spin_lock_init(&cur_trans->commit_lock);
	spin_lock_init(&cur_trans->delayed_refs.lock);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
151
	list_add_tail(&cur_trans->list, &fs_info->trans_list);
J
Josef Bacik 已提交
152
	extent_io_tree_init(&cur_trans->dirty_pages,
153 154 155 156
			     fs_info->btree_inode->i_mapping);
	fs_info->generation++;
	cur_trans->transid = fs_info->generation;
	fs_info->running_transaction = cur_trans;
157
	cur_trans->aborted = 0;
158
	spin_unlock(&fs_info->trans_lock);
159

C
Chris Mason 已提交
160 161 162
	return 0;
}

C
Chris Mason 已提交
163
/*
C
Chris Mason 已提交
164 165 166 167
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
C
Chris Mason 已提交
168
 */
C
Chris Mason 已提交
169
static int record_root_in_trans(struct btrfs_trans_handle *trans,
J
Josef Bacik 已提交
170
			       struct btrfs_root *root)
171
{
172
	if (root->ref_cows && root->last_trans < trans->transid) {
173
		WARN_ON(root == root->fs_info->extent_root);
174 175
		WARN_ON(root->commit_root != root->node);

C
Chris Mason 已提交
176 177 178 179 180 181 182 183 184 185 186 187
		/*
		 * see below for in_trans_setup usage rules
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
		root->in_trans_setup = 1;

		/* make sure readers find in_trans_setup before
		 * they find our root->last_trans update
		 */
		smp_wmb();

J
Josef Bacik 已提交
188 189 190 191 192
		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
193 194 195
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
196
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
		 * with root->in_trans_setup.  When this is 1, we're still
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
218
		btrfs_init_reloc_root(trans, root);
C
Chris Mason 已提交
219 220
		smp_wmb();
		root->in_trans_setup = 0;
221 222 223
	}
	return 0;
}
224

C
Chris Mason 已提交
225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247

/*
 * Make sure @root has been recorded in the transaction behind @trans.
 *
 * Roots without ref_cows are ignored.  When root->last_trans already matches
 * trans->transid and no setup is in flight (in_trans_setup is clear) this
 * returns without taking any lock; otherwise record_root_in_trans() is run
 * under fs_info->reloc_mutex.  Always returns 0.
 */
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (!root->ref_cows)
		return 0;

	/*
	 * see record_root_in_trans for comments about in_trans_setup usage
	 * and barriers
	 */
	smp_rmb();
	/* lock-free fast path: already recorded and setup has finished */
	if (root->last_trans == trans->transid &&
	    !root->in_trans_setup)
		return 0;

	mutex_lock(&root->fs_info->reloc_mutex);
	record_root_in_trans(trans, root);
	mutex_unlock(&root->fs_info->reloc_mutex);

	return 0;
}

C
Chris Mason 已提交
248 249 250 251
/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
C
Chris Mason 已提交
252
static void wait_current_trans(struct btrfs_root *root)
C
Chris Mason 已提交
253
{
254
	struct btrfs_transaction *cur_trans;
C
Chris Mason 已提交
255

J
Josef Bacik 已提交
256
	spin_lock(&root->fs_info->trans_lock);
257
	cur_trans = root->fs_info->running_transaction;
C
Chris Mason 已提交
258
	if (cur_trans && cur_trans->blocked) {
259
		atomic_inc(&cur_trans->use_count);
J
Josef Bacik 已提交
260
		spin_unlock(&root->fs_info->trans_lock);
L
Li Zefan 已提交
261 262 263

		wait_event(root->fs_info->transaction_wait,
			   !cur_trans->blocked);
264
		put_transaction(cur_trans);
J
Josef Bacik 已提交
265 266
	} else {
		spin_unlock(&root->fs_info->trans_lock);
267
	}
C
Chris Mason 已提交
268 269
}

270 271 272 273
/* How a caller wants to enter a transaction; see start_transaction(). */
enum btrfs_trans_type {
	TRANS_START,		/* used by btrfs_start_transaction() */
	TRANS_JOIN,		/* used by btrfs_join_transaction() */
	TRANS_USERSPACE,	/* used by btrfs_start_ioctl_transaction() */
	TRANS_JOIN_NOLOCK,	/* used by btrfs_join_transaction_nolock() */
};

277 278
static int may_wait_transaction(struct btrfs_root *root, int type)
{
J
Josef Bacik 已提交
279 280 281 282 283 284 285 286
	if (root->fs_info->log_root_recovering)
		return 0;

	if (type == TRANS_USERSPACE)
		return 1;

	if (type == TRANS_START &&
	    !atomic_read(&root->fs_info->open_ioctl_trans))
287
		return 1;
J
Josef Bacik 已提交
288

289 290 291
	return 0;
}

292
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
293
						    u64 num_items, int type)
C
Chris Mason 已提交
294
{
295 296
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
297
	u64 num_bytes = 0;
C
Chris Mason 已提交
298
	int ret;
299
	u64 qgroup_reserved = 0;
L
liubo 已提交
300 301 302

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return ERR_PTR(-EROFS);
303 304 305 306 307 308 309 310 311

	if (current->journal_info) {
		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
		h = current->journal_info;
		h->use_count++;
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
312 313 314 315 316 317

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items > 0 && root != root->fs_info->chunk_root) {
318 319 320 321 322 323 324 325
		if (root->fs_info->quota_enabled &&
		    is_fstree(root->root_key.objectid)) {
			qgroup_reserved = num_items * root->leafsize;
			ret = btrfs_qgroup_reserve(root, qgroup_reserved);
			if (ret)
				return ERR_PTR(ret);
		}

326
		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
327
		ret = btrfs_block_rsv_add(root,
328 329 330 331 332
					  &root->fs_info->trans_block_rsv,
					  num_bytes);
		if (ret)
			return ERR_PTR(ret);
	}
333 334 335 336
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);
C
Chris Mason 已提交
337

338
	if (may_wait_transaction(root, type))
C
Chris Mason 已提交
339
		wait_current_trans(root);
340

J
Josef Bacik 已提交
341 342 343 344 345 346
	do {
		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
		if (ret == -EBUSY)
			wait_current_trans(root);
	} while (ret == -EBUSY);

T
Tsutomu Itoh 已提交
347
	if (ret < 0) {
348
		kmem_cache_free(btrfs_trans_handle_cachep, h);
T
Tsutomu Itoh 已提交
349 350
		return ERR_PTR(ret);
	}
351

352 353 354 355
	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
C
Chris Mason 已提交
356
	h->blocks_used = 0;
357
	h->bytes_reserved = 0;
358
	h->root = root;
359
	h->delayed_ref_updates = 0;
360
	h->use_count = 1;
361
	h->adding_csums = 0;
362
	h->block_rsv = NULL;
363
	h->orig_rsv = NULL;
364
	h->aborted = 0;
365
	h->qgroup_reserved = qgroup_reserved;
366 367
	h->delayed_ref_elem.seq = 0;
	INIT_LIST_HEAD(&h->qgroup_ref_list);
368

369 370 371 372 373 374
	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

375
	if (num_bytes) {
J
Josef Bacik 已提交
376
		trace_btrfs_space_reservation(root->fs_info, "transaction",
377
					      h->transid, num_bytes, 1);
378 379
		h->block_rsv = &root->fs_info->trans_block_rsv;
		h->bytes_reserved = num_bytes;
380
	}
J
Josef Bacik 已提交
381

382
got_it:
J
Josef Bacik 已提交
383
	btrfs_record_root_in_trans(h, root);
384 385 386

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
C
Chris Mason 已提交
387 388 389
	return h;
}

390
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
391
						   int num_items)
392
{
393
	return start_transaction(root, num_items, TRANS_START);
394
}
395
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
396
{
397
	return start_transaction(root, 0, TRANS_JOIN);
398 399
}

400
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
401 402 403 404
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

405
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
406
{
407
	return start_transaction(root, 0, TRANS_USERSPACE);
408 409
}

C
Chris Mason 已提交
410
/* wait for a transaction commit to be fully complete */
411
static noinline void wait_for_commit(struct btrfs_root *root,
412 413
				    struct btrfs_transaction *commit)
{
L
Li Zefan 已提交
414
	wait_event(commit->commit_wait, commit->commit_done);
415 416
}

417 418 419 420 421 422 423 424
/*
 * Wait for a transaction commit to finish.
 *
 * @transid: id of the transaction to wait for, or 0 to wait for the newest
 *           transaction that has entered commit but is not yet done
 *
 * Returns 0 when the transaction is already committed, when nothing is
 * committing, or after the wait completes; -EINVAL when a non-zero @transid
 * is not found on fs_info->trans_list.
 */
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_transaction *cur_trans = NULL;
	struct btrfs_transaction *t;

	if (transid) {
		if (transid <= fs_info->last_trans_committed)
			return 0;

		/* find specified transaction */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry(t, &fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
			if (t->transid > transid)
				break;
		}
		spin_unlock(&fs_info->trans_lock);

		if (!cur_trans)
			return -EINVAL;	/* bad transid */
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry_reverse(t, &fs_info->trans_list, list) {
			if (!t->in_commit)
				continue;
			/* already done: nothing to wait for */
			if (!t->commit_done) {
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
			}
			break;
		}
		spin_unlock(&fs_info->trans_lock);

		if (!cur_trans)
			return 0;	/* nothing committing|committed */
	}

	wait_for_commit(root, cur_trans);
	put_transaction(cur_trans);
	return 0;
}

C
Chris Mason 已提交
468 469
void btrfs_throttle(struct btrfs_root *root)
{
J
Josef Bacik 已提交
470
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
471
		wait_current_trans(root);
C
Chris Mason 已提交
472 473
}

474 475 476 477
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;
478 479

	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
480 481 482 483 484 485 486 487
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	int updates;
488
	int err;
489

J
Josef Bacik 已提交
490
	smp_mb();
491 492 493 494 495
	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
496 497 498 499 500
	if (updates) {
		err = btrfs_run_delayed_refs(trans, root, updates);
		if (err) /* Error code will also eval true */
			return err;
	}
501 502 503 504

	return should_end_transaction(trans, root);
}

505
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
506
			  struct btrfs_root *root, int throttle, int lock)
C
Chris Mason 已提交
507
{
508
	struct btrfs_transaction *cur_trans = trans->transaction;
509
	struct btrfs_fs_info *info = root->fs_info;
510
	int count = 0;
511
	int err = 0;
512

513 514 515 516 517
	if (--trans->use_count) {
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

518 519 520 521 522
	/*
	 * do the qgroup accounting as early as possible
	 */
	err = btrfs_delayed_refs_qgroup_accounting(trans, info);

523
	btrfs_trans_release_metadata(trans, root);
524
	trans->block_rsv = NULL;
525 526 527 528 529
	/*
	 * the same root has to be passed to start_transaction and
	 * end_transaction. Subvolume quota depends on this.
	 */
	WARN_ON(trans->root != root);
530 531 532 533 534 535

	if (trans->qgroup_reserved) {
		btrfs_qgroup_free(root, trans->qgroup_reserved);
		trans->qgroup_reserved = 0;
	}

536
	while (count < 2) {
537 538 539 540 541 542 543 544 545 546
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
547
	}
548 549
	btrfs_trans_release_metadata(trans, root);
	trans->block_rsv = NULL;
550

J
Josef Bacik 已提交
551 552
	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
	    should_end_transaction(trans, root)) {
553
		trans->transaction->blocked = 1;
J
Josef Bacik 已提交
554 555
		smp_wmb();
	}
556

557
	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
558 559 560 561 562 563 564
		if (throttle) {
			/*
			 * We may race with somebody else here so end up having
			 * to call end_transaction on ourselves again, so inc
			 * our use_count.
			 */
			trans->use_count++;
565
			return btrfs_commit_transaction(trans, root);
566
		} else {
567
			wake_up_process(info->transaction_kthread);
568
		}
569 570 571
	}

	WARN_ON(cur_trans != info->running_transaction);
572 573
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
574

575
	smp_mb();
C
Chris Mason 已提交
576 577 578
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
J
Josef Bacik 已提交
579 580 581

	if (current->journal_info == trans)
		current->journal_info = NULL;
582

Y
Yan, Zheng 已提交
583 584 585
	if (throttle)
		btrfs_run_delayed_iputs(root);

586 587
	if (trans->aborted ||
	    root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
588
		err = -EIO;
589
	}
590
	assert_qgroups_uptodate(trans);
591

592 593 594
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
	return err;
C
Chris Mason 已提交
595 596
}

597 598 599
/*
 * End a transaction handle: __btrfs_end_transaction() with throttle = 0 and
 * lock = 1.  Returns its result.
 */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 1);
}

/*
 * End a transaction handle: __btrfs_end_transaction() with throttle = 1 and
 * lock = 1.  Returns its result.
 */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

/*
 * End a transaction handle: __btrfs_end_transaction() with throttle = 0 and
 * lock = 0.  Returns its result.
 */
int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 0);
}

/*
 * End a transaction handle with throttle = 1 and lock = 1, i.e. with the
 * same arguments btrfs_end_transaction_throttle() passes.  Returns the
 * result of __btrfs_end_transaction().
 */
int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

C
Chris Mason 已提交
636 637 638
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
639
 * those extents are sent to disk but does not wait on them
C
Chris Mason 已提交
640
 */
641
int btrfs_write_marked_extents(struct btrfs_root *root,
642
			       struct extent_io_tree *dirty_pages, int mark)
C
Chris Mason 已提交
643
{
644
	int err = 0;
645
	int werr = 0;
J
Josef Bacik 已提交
646
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
647
	u64 start = 0;
648
	u64 end;
649

J
Josef Bacik 已提交
650 651 652 653 654 655 656 657 658
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      mark)) {
		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
				   GFP_NOFS);
		err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
659
	}
660 661 662 663 664 665 666 667 668 669 670 671
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
672
			      struct extent_io_tree *dirty_pages, int mark)
673 674 675
{
	int err = 0;
	int werr = 0;
J
Josef Bacik 已提交
676
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
677 678
	u64 start = 0;
	u64 end;
679

J
Josef Bacik 已提交
680 681 682 683 684 685 686 687
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      EXTENT_NEED_WAIT)) {
		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
		err = filemap_fdatawait_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
688
	}
689 690 691
	if (err)
		werr = err;
	return werr;
C
Chris Mason 已提交
692 693
}

694 695 696 697 698 699
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This writes all ranges carrying @mark
 * and then waits for them, so those extents are on disk for transaction or
 * log commit.
 *
 * The wait is done even if the write step failed.  A write error takes
 * precedence over a wait error; 0 is returned when both steps succeed.
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int write_err = btrfs_write_marked_extents(root, dirty_pages, mark);
	int wait_err = btrfs_wait_marked_extents(root, dirty_pages, mark);

	return write_err ? write_err : wait_err;
}

715 716 717 718 719 720 721 722 723
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
724 725
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
726 727
}

C
Chris Mason 已提交
728 729 730 731 732 733 734 735 736 737
/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
738 739
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
C
Chris Mason 已提交
740 741
{
	int ret;
742
	u64 old_root_bytenr;
743
	u64 old_root_used;
744
	struct btrfs_root *tree_root = root->fs_info->tree_root;
C
Chris Mason 已提交
745

746
	old_root_used = btrfs_root_used(&root->root_item);
747
	btrfs_write_dirty_block_groups(trans, root);
748

C
Chris Mason 已提交
749
	while (1) {
750
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
751 752
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
C
Chris Mason 已提交
753
			break;
754

755
		btrfs_set_root_node(&root->root_item, root->node);
C
Chris Mason 已提交
756
		ret = btrfs_update_root(trans, tree_root,
757 758
					&root->root_key,
					&root->root_item);
759 760
		if (ret)
			return ret;
761

762
		old_root_used = btrfs_root_used(&root->root_item);
763
		ret = btrfs_write_dirty_block_groups(trans, root);
764 765
		if (ret)
			return ret;
766
	}
767 768 769 770

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

771 772 773
	return 0;
}

C
Chris Mason 已提交
774 775
/*
 * update all the cowonly tree roots on disk
776 777 778 779
 *
 * The error handling in this function may not be obvious. Any of the
 * failures will cause the file system to go offline. We still need
 * to clean up the delayed refs.
C
Chris Mason 已提交
780
 */
781 782
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
783 784 785
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
786
	struct extent_buffer *eb;
787
	int ret;
788

789
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
790 791
	if (ret)
		return ret;
792

793
	eb = btrfs_lock_root_node(fs_info->tree_root);
794 795
	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
			      0, &eb);
796 797
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);
798

799 800 801
	if (ret)
		return ret;

802
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
803 804
	if (ret)
		return ret;
805

806 807 808
	ret = btrfs_run_dev_stats(trans, root->fs_info);
	BUG_ON(ret);

809 810 811 812 813 814 815
	ret = btrfs_run_qgroups(trans, root->fs_info);
	BUG_ON(ret);

	/* run_qgroups might have added some more refs */
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

C
Chris Mason 已提交
816
	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
817 818 819
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);
820

821 822 823
		ret = update_cowonly_root(trans, root);
		if (ret)
			return ret;
C
Chris Mason 已提交
824
	}
825 826 827 828 829

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

C
Chris Mason 已提交
830 831 832
	return 0;
}

C
Chris Mason 已提交
833 834 835 836 837
/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
838
int btrfs_add_dead_root(struct btrfs_root *root)
839
{
J
Josef Bacik 已提交
840
	spin_lock(&root->fs_info->trans_lock);
841
	list_add(&root->root_list, &root->fs_info->dead_roots);
J
Josef Bacik 已提交
842
	spin_unlock(&root->fs_info->trans_lock);
843 844 845
	return 0;
}

C
Chris Mason 已提交
846
/*
847
 * update all the cowonly tree roots on disk
C
Chris Mason 已提交
848
 */
849 850
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
851 852
{
	struct btrfs_root *gang[8];
853
	struct btrfs_fs_info *fs_info = root->fs_info;
854 855
	int i;
	int ret;
856 857
	int err = 0;

J
Josef Bacik 已提交
858
	spin_lock(&fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
859
	while (1) {
860 861
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
862 863 864 865 866 867
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
868 869 870
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
871
			spin_unlock(&fs_info->fs_roots_radix_lock);
Y
Yan Zheng 已提交
872

873
			btrfs_free_log(trans, root);
874
			btrfs_update_reloc_root(trans, root);
875
			btrfs_orphan_commit_root(trans, root);
876

877 878
			btrfs_save_ino_cache(root, trans);

879 880 881 882
			/* see comments in should_cow_block() */
			root->force_cow = 0;
			smp_wmb();

883
			if (root->commit_root != root->node) {
884
				mutex_lock(&root->fs_commit_mutex);
J
Josef Bacik 已提交
885
				switch_commit_root(root);
886 887 888
				btrfs_unpin_free_ino(root);
				mutex_unlock(&root->fs_commit_mutex);

889 890 891
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}
892 893

			err = btrfs_update_root(trans, fs_info->tree_root,
894 895
						&root->root_key,
						&root->root_item);
J
Josef Bacik 已提交
896
			spin_lock(&fs_info->fs_roots_radix_lock);
897 898
			if (err)
				break;
899 900
		}
	}
J
Josef Bacik 已提交
901
	spin_unlock(&fs_info->fs_roots_radix_lock);
902
	return err;
903 904
}

C
Chris Mason 已提交
905 906 907 908
/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
909 910 911 912
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
913
	int ret;
914
	unsigned long nr;
915

916
	if (xchg(&root->defrag_running, 1))
917
		return 0;
918

919
	while (1) {
920 921 922 923
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

924
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
925

926
		nr = trans->blocks_used;
927
		btrfs_end_transaction(trans, root);
928
		btrfs_btree_balance_dirty(info->tree_root, nr);
929 930
		cond_resched();

931
		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
932 933 934
			break;
	}
	root->defrag_running = 0;
935
	return ret;
936 937
}

C
Chris Mason 已提交
938 939 940 941
/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
942
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
943 944 945 946
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
947
	struct btrfs_root_item *new_root_item;
948 949
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
950
	struct btrfs_root *parent_root;
L
Liu Bo 已提交
951
	struct btrfs_block_rsv *rsv;
952
	struct inode *parent_inode;
953
	struct dentry *parent;
954
	struct dentry *dentry;
955
	struct extent_buffer *tmp;
956
	struct extent_buffer *old;
957
	struct timespec cur_time = CURRENT_TIME;
958
	int ret;
959
	u64 to_reserve = 0;
960
	u64 index = 0;
961
	u64 objectid;
L
Li Zefan 已提交
962
	u64 root_flags;
963
	uuid_le new_uuid;
964

L
Liu Bo 已提交
965 966
	rsv = trans->block_rsv;

967 968
	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
969
		ret = pending->error = -ENOMEM;
970 971
		goto fail;
	}
972

973
	ret = btrfs_find_free_objectid(tree_root, &objectid);
974 975
	if (ret) {
		pending->error = ret;
976
		goto fail;
977
	}
978

979
	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
980 981

	if (to_reserve > 0) {
982 983
		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
						  to_reserve);
984 985 986 987 988 989
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

A
Arne Jansen 已提交
990 991 992 993 994 995 996 997
	ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
				   objectid, pending->inherit);
	kfree(pending->inherit);
	if (ret) {
		pending->error = ret;
		goto fail;
	}

998
	key.objectid = objectid;
999 1000
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;
1001

1002
	trans->block_rsv = &pending->block_rsv;
1003

1004
	dentry = pending->dentry;
1005 1006
	parent = dget_parent(dentry);
	parent_inode = parent->d_inode;
1007
	parent_root = BTRFS_I(parent_inode)->root;
C
Chris Mason 已提交
1008
	record_root_in_trans(trans, parent_root);
1009

1010 1011 1012
	/*
	 * insert the directory item
	 */
1013
	ret = btrfs_set_inode_index(parent_inode, &index);
1014
	BUG_ON(ret); /* -ENOMEM */
1015
	ret = btrfs_insert_dir_item(trans, parent_root,
1016
				dentry->d_name.name, dentry->d_name.len,
1017
				parent_inode, &key,
1018
				BTRFS_FT_DIR, index);
1019
	if (ret == -EEXIST) {
1020 1021 1022
		pending->error = -EEXIST;
		dput(parent);
		goto fail;
1023 1024 1025
	} else if (ret) {
		goto abort_trans_dput;
	}
1026

1027 1028
	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
1029
	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1030
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
1031
	if (ret)
1032
		goto abort_trans_dput;
1033

1034 1035 1036 1037 1038 1039 1040
	/*
	 * pull in the delayed directory update
	 * and the delayed inode item
	 * otherwise we corrupt the FS during
	 * snapshot
	 */
	ret = btrfs_run_delayed_items(trans, root);
1041 1042
	if (ret) { /* Transaction aborted */
		dput(parent);
1043
		goto fail;
1044
	}
1045

C
Chris Mason 已提交
1046
	record_root_in_trans(trans, root);
1047 1048
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
1049
	btrfs_check_and_init_root_item(new_root_item);
1050

L
Li Zefan 已提交
1051 1052 1053 1054 1055 1056 1057
	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

1058 1059 1060 1061 1062 1063 1064
	btrfs_set_root_generation_v2(new_root_item,
			trans->transid);
	uuid_le_gen(&new_uuid);
	memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
			BTRFS_UUID_SIZE);
	new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
1065
	new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec);
1066 1067 1068 1069 1070 1071
	btrfs_set_root_otransid(new_root_item, trans->transid);
	memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
	memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
	btrfs_set_root_stransid(new_root_item, 0);
	btrfs_set_root_rtransid(new_root_item, 0);

1072
	old = btrfs_lock_root_node(root);
1073
	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
1074 1075 1076 1077 1078
	if (ret) {
		btrfs_tree_unlock(old);
		free_extent_buffer(old);
		goto abort_trans_dput;
	}
1079

1080 1081
	btrfs_set_lock_blocking(old);

1082
	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
1083
	/* clean up in any case */
1084 1085
	btrfs_tree_unlock(old);
	free_extent_buffer(old);
1086 1087
	if (ret)
		goto abort_trans_dput;
1088

1089 1090 1091 1092
	/* see comments in should_cow_block() */
	root->force_cow = 1;
	smp_wmb();

1093
	btrfs_set_root_node(new_root_item, tmp);
1094 1095 1096
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
1097 1098
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
1099
	if (ret)
1100
		goto abort_trans_dput;
1101

1102 1103 1104 1105
	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
1106
				 parent_root->root_key.objectid,
L
Li Zefan 已提交
1107
				 btrfs_ino(parent_inode), index,
1108
				 dentry->d_name.name, dentry->d_name.len);
1109
	dput(parent);
1110 1111
	if (ret)
		goto fail;
1112

1113 1114
	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
1115 1116
	if (IS_ERR(pending->snap)) {
		ret = PTR_ERR(pending->snap);
1117
		goto abort_trans;
1118
	}
1119

1120 1121 1122 1123
	ret = btrfs_reloc_post_snapshot(trans, pending);
	if (ret)
		goto abort_trans;
	ret = 0;
1124
fail:
1125
	kfree(new_root_item);
L
Liu Bo 已提交
1126
	trans->block_rsv = rsv;
1127
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1128 1129
	return ret;

1130 1131
abort_trans_dput:
	dput(parent);
1132 1133 1134
abort_trans:
	btrfs_abort_transaction(trans, root, ret);
	goto fail;
1135 1136
}

C
Chris Mason 已提交
1137 1138 1139
/*
 * create all the snapshots we've scheduled for creation
 */
1140 1141
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
1142 1143 1144 1145
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;

1146 1147
	list_for_each_entry(pending, head, list)
		create_pending_snapshot(trans, fs_info, pending);
1148 1149 1150
	return 0;
}

1151 1152 1153 1154 1155
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

1156
	super = root->fs_info->super_copy;
1157 1158 1159 1160 1161 1162 1163 1164 1165 1166

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
1167
	if (btrfs_test_opt(root, SPACE_CACHE))
1168
		super->cache_generation = root_item->generation;
1169 1170
}

1171 1172 1173
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1174
	spin_lock(&info->trans_lock);
1175 1176
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
J
Josef Bacik 已提交
1177
	spin_unlock(&info->trans_lock);
1178 1179 1180
	return ret;
}

1181 1182 1183
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1184
	spin_lock(&info->trans_lock);
1185 1186
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
J
Josef Bacik 已提交
1187
	spin_unlock(&info->trans_lock);
1188 1189 1190
	return ret;
}

S
Sage Weil 已提交
1191 1192 1193 1194 1195 1196 1197
/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
					    struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1198
	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
S
Sage Weil 已提交
1199 1200 1201 1202 1203 1204 1205 1206 1207
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
					 struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1208 1209
	wait_event(root->fs_info->transaction_wait,
		   trans->commit_done || (trans->in_commit && !trans->blocked));
S
Sage Weil 已提交
1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238
}

/*
 * commit transactions asynchronously. once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 *
 * This is the context handed from btrfs_commit_transaction_async() to the
 * do_async_commit() work function, which frees it once the commit call
 * has returned.
 */
struct btrfs_async_commit {
	/* handle obtained with btrfs_join_transaction(); the worker commits it */
	struct btrfs_trans_handle *newtrans;
	/* root passed to btrfs_commit_transaction() by the worker */
	struct btrfs_root *root;
	/* delayed work item that runs do_async_commit() */
	struct delayed_work work;
};

/*
 * work function for asynchronous commits: commit the transaction handle
 * stored in the btrfs_async_commit that embeds @work, then free that
 * context.
 */
static void do_async_commit(struct work_struct *work)
{
	struct delayed_work *dwork;
	struct btrfs_async_commit *async;

	/* step out from the work_struct to the enclosing commit context */
	dwork = container_of(work, struct delayed_work, work);
	async = container_of(dwork, struct btrfs_async_commit, work);

	btrfs_commit_transaction(async->newtrans, async->root);
	kfree(async);
}

int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   int wait_for_unblock)
{
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
T
Tsutomu Itoh 已提交
1239 1240
	if (!ac)
		return -ENOMEM;
S
Sage Weil 已提交
1241 1242 1243

	INIT_DELAYED_WORK(&ac->work, do_async_commit);
	ac->root = root;
1244
	ac->newtrans = btrfs_join_transaction(root);
1245 1246 1247 1248 1249
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}
S
Sage Weil 已提交
1250 1251 1252

	/* take transaction reference */
	cur_trans = trans->transaction;
1253
	atomic_inc(&cur_trans->use_count);
S
Sage Weil 已提交
1254 1255 1256 1257 1258 1259 1260 1261 1262 1263

	btrfs_end_transaction(trans, root);
	schedule_delayed_work(&ac->work, 0);

	/* wait for transaction to start and unblock */
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(root, cur_trans);
	else
		wait_current_trans_commit_start(root, cur_trans);

1264 1265 1266 1267
	if (current->journal_info == trans)
		current->journal_info = NULL;

	put_transaction(cur_trans);
S
Sage Weil 已提交
1268 1269 1270
	return 0;
}

1271 1272

static void cleanup_transaction(struct btrfs_trans_handle *trans,
1273
				struct btrfs_root *root, int err)
1274 1275 1276 1277 1278
{
	struct btrfs_transaction *cur_trans = trans->transaction;

	WARN_ON(trans->use_count > 1);

1279 1280
	btrfs_abort_transaction(trans, root, err);

1281 1282
	spin_lock(&root->fs_info->trans_lock);
	list_del_init(&cur_trans->list);
1283 1284 1285 1286
	if (cur_trans == root->fs_info->running_transaction) {
		root->fs_info->running_transaction = NULL;
		root->fs_info->trans_no_join = 0;
	}
1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303
	spin_unlock(&root->fs_info->trans_lock);

	btrfs_cleanup_one_transaction(trans->transaction, root);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	trace_btrfs_transaction_commit(root);

	btrfs_scrub_continue(root);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
}

S
Sage Weil 已提交
1304 1305 1306 1307 1308 1309 1310
/*
 * btrfs_transaction state sequence:
 *    in_commit = 0, blocked = 0  (initial)
 *    in_commit = 1, blocked = 1
 *    blocked = 0
 *    commit_done = 1
 */
C
Chris Mason 已提交
1311 1312 1313
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
1314
	unsigned long joined = 0;
1315
	struct btrfs_transaction *cur_trans = trans->transaction;
C
Chris Mason 已提交
1316
	struct btrfs_transaction *prev_trans = NULL;
C
Chris Mason 已提交
1317
	DEFINE_WAIT(wait);
1318
	int ret = -EIO;
1319 1320
	int should_grow = 0;
	unsigned long now = get_seconds();
1321
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
C
Chris Mason 已提交
1322

1323 1324
	btrfs_run_ordered_operations(root, 0);

1325 1326 1327
	if (cur_trans->aborted)
		goto cleanup_transaction;

1328 1329 1330 1331
	/* make a pass through all the delayed refs we have so far
	 * any runnings procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
1332 1333
	if (ret)
		goto cleanup_transaction;
1334

1335 1336 1337
	btrfs_trans_release_metadata(trans, root);
	trans->block_rsv = NULL;

1338
	cur_trans = trans->transaction;
1339

1340 1341 1342 1343
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
1344
	cur_trans->delayed_refs.flushing = 1;
1345

1346
	ret = btrfs_run_delayed_refs(trans, root, 0);
1347 1348
	if (ret)
		goto cleanup_transaction;
1349

J
Josef Bacik 已提交
1350
	spin_lock(&cur_trans->commit_lock);
1351
	if (cur_trans->in_commit) {
J
Josef Bacik 已提交
1352
		spin_unlock(&cur_trans->commit_lock);
1353
		atomic_inc(&cur_trans->use_count);
1354
		ret = btrfs_end_transaction(trans, root);
C
Chris Mason 已提交
1355

1356
		wait_for_commit(root, cur_trans);
1357

C
Chris Mason 已提交
1358
		put_transaction(cur_trans);
1359

1360
		return ret;
C
Chris Mason 已提交
1361
	}
1362

C
Chris Mason 已提交
1363
	trans->transaction->in_commit = 1;
1364
	trans->transaction->blocked = 1;
J
Josef Bacik 已提交
1365
	spin_unlock(&cur_trans->commit_lock);
S
Sage Weil 已提交
1366 1367
	wake_up(&root->fs_info->transaction_blocked_wait);

J
Josef Bacik 已提交
1368
	spin_lock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1369 1370 1371 1372
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
1373
			atomic_inc(&prev_trans->use_count);
J
Josef Bacik 已提交
1374
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1375 1376 1377

			wait_for_commit(root, prev_trans);

1378
			put_transaction(prev_trans);
J
Josef Bacik 已提交
1379 1380
		} else {
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1381
		}
J
Josef Bacik 已提交
1382 1383
	} else {
		spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1384
	}
1385

1386 1387
	if (!btrfs_test_opt(root, SSD) &&
	    (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1388 1389
		should_grow = 1;

1390
	do {
1391
		int snap_pending = 0;
J
Josef Bacik 已提交
1392

1393
		joined = cur_trans->num_joined;
1394 1395 1396
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

C
Chris Mason 已提交
1397
		WARN_ON(cur_trans != trans->transaction);
1398

1399
		if (flush_on_commit || snap_pending) {
Y
Yan, Zheng 已提交
1400
			btrfs_start_delalloc_inodes(root, 1);
1401
			btrfs_wait_ordered_extents(root, 0, 1);
1402 1403
		}

1404
		ret = btrfs_run_delayed_items(trans, root);
1405 1406
		if (ret)
			goto cleanup_transaction;
1407

1408 1409 1410 1411 1412 1413 1414
		/*
		 * running the delayed items may have added new refs. account
		 * them now so that they hinder processing of more delayed refs
		 * as little as possible.
		 */
		btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);

1415 1416 1417 1418 1419 1420 1421 1422 1423
		/*
		 * rename don't use btrfs_join_transaction, so, once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and no for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

1424 1425 1426
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

1427
		if (atomic_read(&cur_trans->num_writers) > 1)
1428 1429 1430
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);
1431 1432

		finish_wait(&cur_trans->writer_wait, &wait);
1433
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
1434
		 (should_grow && cur_trans->num_joined != joined));
1435

1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446
	/*
	 * Ok now we need to make sure to block out any other joins while we
	 * commit the transaction.  We could have started a join before setting
	 * no_join so make sure to wait for num_writers to == 1 again.
	 */
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->trans_no_join = 1;
	spin_unlock(&root->fs_info->trans_lock);
	wait_event(cur_trans->writer_wait,
		   atomic_read(&cur_trans->num_writers) == 1);

C
Chris Mason 已提交
1447 1448 1449 1450 1451 1452 1453
	/*
	 * the reloc mutex makes sure that we stop
	 * the balancing code from coming in and moving
	 * extents around in the middle of the commit
	 */
	mutex_lock(&root->fs_info->reloc_mutex);

1454
	ret = btrfs_run_delayed_items(trans, root);
1455 1456 1457 1458
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1459

1460
	ret = create_pending_snapshots(trans, root->fs_info);
1461 1462 1463 1464
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1465

1466
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1467 1468 1469 1470
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1471

1472 1473 1474 1475 1476 1477
	/*
	 * make sure none of the code above managed to slip in a
	 * delayed item
	 */
	btrfs_assert_delayed_root_empty(root);

C
Chris Mason 已提交
1478
	WARN_ON(cur_trans != trans->transaction);
C
Chris Mason 已提交
1479

A
Arne Jansen 已提交
1480
	btrfs_scrub_pause(root);
1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495
	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

1496
	ret = commit_fs_roots(trans, root);
1497 1498
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
1499
		mutex_unlock(&root->fs_info->reloc_mutex);
1500 1501
		goto cleanup_transaction;
	}
1502

1503
	/* commit_fs_roots gets rid of all the tree log roots, it is now
1504 1505 1506 1507
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

1508
	ret = commit_cowonly_roots(trans, root);
1509 1510
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
1511
		mutex_unlock(&root->fs_info->reloc_mutex);
1512 1513
		goto cleanup_transaction;
	}
1514

1515 1516
	btrfs_prepare_extent_commit(trans, root);

C
Chris Mason 已提交
1517
	cur_trans = root->fs_info->running_transaction;
1518 1519 1520

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
J
Josef Bacik 已提交
1521
	switch_commit_root(root->fs_info->tree_root);
1522 1523 1524

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
J
Josef Bacik 已提交
1525
	switch_commit_root(root->fs_info->chunk_root);
1526

1527
	assert_qgroups_uptodate(trans);
1528
	update_super_roots(root);
1529 1530

	if (!root->fs_info->log_root_recovering) {
1531 1532
		btrfs_set_super_log_root(root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1533 1534
	}

1535 1536
	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
	       sizeof(*root->fs_info->super_copy));
C
Chris Mason 已提交
1537

1538
	trans->transaction->blocked = 0;
J
Josef Bacik 已提交
1539 1540 1541 1542
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->running_transaction = NULL;
	root->fs_info->trans_no_join = 0;
	spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1543
	mutex_unlock(&root->fs_info->reloc_mutex);
1544

1545
	wake_up(&root->fs_info->transaction_wait);
1546

C
Chris Mason 已提交
1547
	ret = btrfs_write_and_wait_transaction(trans, root);
1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559
	if (ret) {
		btrfs_error(root->fs_info, ret,
			    "Error while writing out transaction.");
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}

	ret = write_ctree_super(trans, root, 0);
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}
1560

1561 1562 1563 1564 1565 1566
	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

1567
	btrfs_finish_extent_commit(trans, root);
1568

C
Chris Mason 已提交
1569
	cur_trans->commit_done = 1;
1570

1571
	root->fs_info->last_trans_committed = cur_trans->transid;
J
Josef Bacik 已提交
1572

C
Chris Mason 已提交
1573
	wake_up(&cur_trans->commit_wait);
1574

J
Josef Bacik 已提交
1575
	spin_lock(&root->fs_info->trans_lock);
1576
	list_del_init(&cur_trans->list);
J
Josef Bacik 已提交
1577 1578
	spin_unlock(&root->fs_info->trans_lock);

C
Chris Mason 已提交
1579
	put_transaction(cur_trans);
C
Chris Mason 已提交
1580
	put_transaction(cur_trans);
1581

1582 1583
	trace_btrfs_transaction_commit(root);

A
Arne Jansen 已提交
1584 1585
	btrfs_scrub_continue(root);

J
Josef Bacik 已提交
1586 1587 1588
	if (current->journal_info == trans)
		current->journal_info = NULL;

C
Chris Mason 已提交
1589
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
Y
Yan, Zheng 已提交
1590 1591 1592 1593

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

C
Chris Mason 已提交
1594
	return ret;
1595 1596

cleanup_transaction:
1597 1598
	btrfs_trans_release_metadata(trans, root);
	trans->block_rsv = NULL;
1599 1600 1601 1602
	btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
//	WARN_ON(1);
	if (current->journal_info == trans)
		current->journal_info = NULL;
1603
	cleanup_transaction(trans, root, ret);
1604 1605

	return ret;
C
Chris Mason 已提交
1606 1607
}

C
Chris Mason 已提交
1608 1609 1610
/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
1611 1612
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
1613 1614 1615
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

J
Josef Bacik 已提交
1616
	spin_lock(&fs_info->trans_lock);
1617
	list_splice_init(&fs_info->dead_roots, &list);
J
Josef Bacik 已提交
1618
	spin_unlock(&fs_info->trans_lock);
1619

1620
	while (!list_empty(&list)) {
1621 1622
		int ret;

1623
		root = list_entry(list.next, struct btrfs_root, root_list);
1624 1625
		list_del(&root->root_list);

1626 1627
		btrfs_kill_all_delayed_nodes(root);

1628 1629
		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
1630
			ret = btrfs_drop_snapshot(root, NULL, 0, 0);
1631
		else
1632 1633
			ret =btrfs_drop_snapshot(root, NULL, 1, 0);
		BUG_ON(ret < 0);
1634 1635 1636
	}
	return 0;
}