transaction.c 42.3 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

C
Chris Mason 已提交
19
#include <linux/fs.h>
20
#include <linux/slab.h>
C
Chris Mason 已提交
21
#include <linux/sched.h>
22
#include <linux/writeback.h>
23
#include <linux/pagemap.h>
24
#include <linux/blkdev.h>
25
#include <linux/uuid.h>
C
Chris Mason 已提交
26 27 28
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
29
#include "locking.h"
30
#include "tree-log.h"
31
#include "inode-map.h"
32
#include "volumes.h"
C
Chris Mason 已提交
33

34 35
#define BTRFS_ROOT_TRANS_TAG 0

36
void put_transaction(struct btrfs_transaction *transaction)
C
Chris Mason 已提交
37
{
38 39
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
J
Josef Bacik 已提交
40
		BUG_ON(!list_empty(&transaction->list));
41
		WARN_ON(transaction->delayed_refs.root.rb_node);
C
Chris Mason 已提交
42 43
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
C
Chris Mason 已提交
44
	}
C
Chris Mason 已提交
45 46
}

J
Josef Bacik 已提交
47 48 49 50 51 52
/*
 * Make the commit root of @root follow its current root node: release the
 * extent buffer currently stored in root->commit_root and replace it with
 * whatever btrfs_root_node() returns for @root.
 */
static noinline void switch_commit_root(struct btrfs_root *root)
{
	free_extent_buffer(root->commit_root);
	root->commit_root = btrfs_root_node(root);
}

C
Chris Mason 已提交
53 54 55
/*
 * either allocate a new transaction or hop into the existing one
 */
J
Josef Bacik 已提交
56
static noinline int join_transaction(struct btrfs_root *root, int nofail)
C
Chris Mason 已提交
57 58
{
	struct btrfs_transaction *cur_trans;
59
	struct btrfs_fs_info *fs_info = root->fs_info;
J
Josef Bacik 已提交
60

61
	spin_lock(&fs_info->trans_lock);
62
loop:
63
	/* The file system has been taken offline. No new transactions. */
64 65
	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		spin_unlock(&fs_info->trans_lock);
66 67 68
		return -EROFS;
	}

69
	if (fs_info->trans_no_join) {
J
Josef Bacik 已提交
70
		if (!nofail) {
71
			spin_unlock(&fs_info->trans_lock);
J
Josef Bacik 已提交
72 73 74 75
			return -EBUSY;
		}
	}

76
	cur_trans = fs_info->running_transaction;
J
Josef Bacik 已提交
77
	if (cur_trans) {
78
		if (cur_trans->aborted) {
79
			spin_unlock(&fs_info->trans_lock);
80
			return cur_trans->aborted;
81
		}
J
Josef Bacik 已提交
82
		atomic_inc(&cur_trans->use_count);
83
		atomic_inc(&cur_trans->num_writers);
84
		cur_trans->num_joined++;
85
		spin_unlock(&fs_info->trans_lock);
J
Josef Bacik 已提交
86
		return 0;
C
Chris Mason 已提交
87
	}
88
	spin_unlock(&fs_info->trans_lock);
J
Josef Bacik 已提交
89 90 91 92

	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
93

94 95
	spin_lock(&fs_info->trans_lock);
	if (fs_info->running_transaction) {
96 97 98 99
		/*
		 * someone started a transaction after we unlocked.  Make sure
		 * to redo the trans_no_join checks above
		 */
J
Josef Bacik 已提交
100
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
101
		cur_trans = fs_info->running_transaction;
102
		goto loop;
103 104
	} else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		spin_unlock(&fs_info->trans_lock);
105 106
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		return -EROFS;
C
Chris Mason 已提交
107
	}
108

J
Josef Bacik 已提交
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
	atomic_set(&cur_trans->num_writers, 1);
	cur_trans->num_joined = 0;
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->in_commit = 0;
	cur_trans->blocked = 0;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
	cur_trans->commit_done = 0;
	cur_trans->start_time = get_seconds();

	cur_trans->delayed_refs.root = RB_ROOT;
	cur_trans->delayed_refs.num_entries = 0;
	cur_trans->delayed_refs.num_heads_ready = 0;
	cur_trans->delayed_refs.num_heads = 0;
	cur_trans->delayed_refs.flushing = 0;
	cur_trans->delayed_refs.run_delayed_start = 0;
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146

	/*
	 * although the tree mod log is per file system and not per transaction,
	 * the log must never go across transaction boundaries.
	 */
	smp_mb();
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
			"creating a fresh transaction\n");
		WARN_ON(1);
	}
	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
		printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
			"creating a fresh transaction\n");
		WARN_ON(1);
	}
	atomic_set(&fs_info->tree_mod_seq, 0);

J
Josef Bacik 已提交
147 148 149 150
	spin_lock_init(&cur_trans->commit_lock);
	spin_lock_init(&cur_trans->delayed_refs.lock);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
151
	list_add_tail(&cur_trans->list, &fs_info->trans_list);
J
Josef Bacik 已提交
152
	extent_io_tree_init(&cur_trans->dirty_pages,
153 154 155 156
			     fs_info->btree_inode->i_mapping);
	fs_info->generation++;
	cur_trans->transid = fs_info->generation;
	fs_info->running_transaction = cur_trans;
157
	cur_trans->aborted = 0;
158
	spin_unlock(&fs_info->trans_lock);
159

C
Chris Mason 已提交
160 161 162
	return 0;
}

C
Chris Mason 已提交
163
/*
C
Chris Mason 已提交
164 165 166 167
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
C
Chris Mason 已提交
168
 */
C
Chris Mason 已提交
169
static int record_root_in_trans(struct btrfs_trans_handle *trans,
J
Josef Bacik 已提交
170
			       struct btrfs_root *root)
171
{
172
	if (root->ref_cows && root->last_trans < trans->transid) {
173
		WARN_ON(root == root->fs_info->extent_root);
174 175
		WARN_ON(root->commit_root != root->node);

C
Chris Mason 已提交
176 177 178 179 180 181 182 183 184 185 186 187
		/*
		 * see below for in_trans_setup usage rules
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
		root->in_trans_setup = 1;

		/* make sure readers find in_trans_setup before
		 * they find our root->last_trans update
		 */
		smp_wmb();

J
Josef Bacik 已提交
188 189 190 191 192
		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
193 194 195
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
196
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
		 * with root->in_trans_setup.  When this is 1, we're still
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
218
		btrfs_init_reloc_root(trans, root);
C
Chris Mason 已提交
219 220
		smp_wmb();
		root->in_trans_setup = 0;
221 222 223
	}
	return 0;
}
224

C
Chris Mason 已提交
225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247

/*
 * Locked front end for record_root_in_trans().
 *
 * Roots without ref_cows are ignored.  When @root is already stamped with
 * the transid of @trans and no setup is in flight, the reloc mutex is not
 * taken at all.  Always returns 0.
 */
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (!root->ref_cows)
		return 0;

	/*
	 * see record_root_in_trans for comments about in_trans_setup usage
	 * and barriers
	 */
	smp_rmb();
	if (root->last_trans != trans->transid || root->in_trans_setup) {
		mutex_lock(&root->fs_info->reloc_mutex);
		record_root_in_trans(trans, root);
		mutex_unlock(&root->fs_info->reloc_mutex);
	}

	return 0;
}

C
Chris Mason 已提交
248 249 250 251
/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
C
Chris Mason 已提交
252
static void wait_current_trans(struct btrfs_root *root)
C
Chris Mason 已提交
253
{
254
	struct btrfs_transaction *cur_trans;
C
Chris Mason 已提交
255

J
Josef Bacik 已提交
256
	spin_lock(&root->fs_info->trans_lock);
257
	cur_trans = root->fs_info->running_transaction;
C
Chris Mason 已提交
258
	if (cur_trans && cur_trans->blocked) {
259
		atomic_inc(&cur_trans->use_count);
J
Josef Bacik 已提交
260
		spin_unlock(&root->fs_info->trans_lock);
L
Li Zefan 已提交
261 262 263

		wait_event(root->fs_info->transaction_wait,
			   !cur_trans->blocked);
264
		put_transaction(cur_trans);
J
Josef Bacik 已提交
265 266
	} else {
		spin_unlock(&root->fs_info->trans_lock);
267
	}
C
Chris Mason 已提交
268 269
}

270 271 272 273
/* how a caller wants to enter a transaction, see start_transaction() */
enum btrfs_trans_type {
	TRANS_START,		/* used by btrfs_start_transaction() */
	TRANS_JOIN,		/* used by btrfs_join_transaction() */
	TRANS_USERSPACE,	/* used by btrfs_start_ioctl_transaction() */
	TRANS_JOIN_NOLOCK,	/* used by btrfs_join_transaction_nolock() */
};

277 278
static int may_wait_transaction(struct btrfs_root *root, int type)
{
J
Josef Bacik 已提交
279 280 281 282 283 284 285 286
	if (root->fs_info->log_root_recovering)
		return 0;

	if (type == TRANS_USERSPACE)
		return 1;

	if (type == TRANS_START &&
	    !atomic_read(&root->fs_info->open_ioctl_trans))
287
		return 1;
J
Josef Bacik 已提交
288

289 290 291
	return 0;
}

292
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
293
						    u64 num_items, int type)
C
Chris Mason 已提交
294
{
295 296
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
297
	u64 num_bytes = 0;
C
Chris Mason 已提交
298
	int ret;
299
	u64 qgroup_reserved = 0;
L
liubo 已提交
300 301 302

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return ERR_PTR(-EROFS);
303 304 305 306 307 308 309 310 311

	if (current->journal_info) {
		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
		h = current->journal_info;
		h->use_count++;
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
312 313 314 315 316 317

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items > 0 && root != root->fs_info->chunk_root) {
318 319 320 321 322 323 324 325
		if (root->fs_info->quota_enabled &&
		    is_fstree(root->root_key.objectid)) {
			qgroup_reserved = num_items * root->leafsize;
			ret = btrfs_qgroup_reserve(root, qgroup_reserved);
			if (ret)
				return ERR_PTR(ret);
		}

326
		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
327
		ret = btrfs_block_rsv_add(root,
328 329 330 331 332
					  &root->fs_info->trans_block_rsv,
					  num_bytes);
		if (ret)
			return ERR_PTR(ret);
	}
333 334 335 336
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);
C
Chris Mason 已提交
337

338
	if (may_wait_transaction(root, type))
C
Chris Mason 已提交
339
		wait_current_trans(root);
340

J
Josef Bacik 已提交
341 342 343 344 345 346
	do {
		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
		if (ret == -EBUSY)
			wait_current_trans(root);
	} while (ret == -EBUSY);

T
Tsutomu Itoh 已提交
347
	if (ret < 0) {
348
		kmem_cache_free(btrfs_trans_handle_cachep, h);
T
Tsutomu Itoh 已提交
349 350
		return ERR_PTR(ret);
	}
351

352 353 354 355
	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
C
Chris Mason 已提交
356
	h->blocks_used = 0;
357
	h->bytes_reserved = 0;
358
	h->root = root;
359
	h->delayed_ref_updates = 0;
360
	h->use_count = 1;
361
	h->adding_csums = 0;
362
	h->block_rsv = NULL;
363
	h->orig_rsv = NULL;
364
	h->aborted = 0;
365
	h->qgroup_reserved = qgroup_reserved;
366 367
	h->delayed_ref_elem.seq = 0;
	INIT_LIST_HEAD(&h->qgroup_ref_list);
368

369 370 371 372 373 374
	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

375
	if (num_bytes) {
J
Josef Bacik 已提交
376
		trace_btrfs_space_reservation(root->fs_info, "transaction",
377
					      h->transid, num_bytes, 1);
378 379
		h->block_rsv = &root->fs_info->trans_block_rsv;
		h->bytes_reserved = num_bytes;
380
	}
J
Josef Bacik 已提交
381

382
got_it:
J
Josef Bacik 已提交
383
	btrfs_record_root_in_trans(h, root);
384 385 386

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
C
Chris Mason 已提交
387 388 389
	return h;
}

390
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
391
						   int num_items)
392
{
393
	return start_transaction(root, num_items, TRANS_START);
394
}
395
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
396
{
397
	return start_transaction(root, 0, TRANS_JOIN);
398 399
}

400
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
401 402 403 404
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

405
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
406
{
407
	return start_transaction(root, 0, TRANS_USERSPACE);
408 409
}

C
Chris Mason 已提交
410
/* wait for a transaction commit to be fully complete */
411
static noinline void wait_for_commit(struct btrfs_root *root,
412 413
				    struct btrfs_transaction *commit)
{
L
Li Zefan 已提交
414
	wait_event(commit->commit_wait, commit->commit_done);
415 416
}

417 418 419 420 421 422 423 424
/*
 * Wait until a transaction has been fully committed.
 *
 * @root:    any root of the filesystem
 * @transid: id of the transaction to wait for, or 0 to wait for the newest
 *           transaction that is currently committing
 *
 * Returns 0 once the commit is done, or immediately when there is nothing
 * to wait for.  Returns -EINVAL when @transid names a transaction that is
 * neither on the transaction list nor already committed.
 */
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_transaction *cur_trans = NULL, *t;

	if (transid) {
		if (transid <= fs_info->last_trans_committed)
			return 0;

		/* find specified transaction */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry(t, &fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
			if (t->transid > transid)
				break;
		}
		spin_unlock(&fs_info->trans_lock);

		if (!cur_trans) {
			/*
			 * Either a bad transid, or the transaction finished
			 * committing and left the list between our first
			 * check and taking trans_lock.  Only the former is
			 * an error.
			 */
			if (transid > fs_info->last_trans_committed)
				return -EINVAL;
			return 0;
		}
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry_reverse(t, &fs_info->trans_list, list) {
			if (t->in_commit) {
				if (t->commit_done)
					break;
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
		}
		spin_unlock(&fs_info->trans_lock);
		if (!cur_trans)
			return 0;  /* nothing committing|committed */
	}

	wait_for_commit(root, cur_trans);
	put_transaction(cur_trans);
	return 0;
}

C
Chris Mason 已提交
468 469
void btrfs_throttle(struct btrfs_root *root)
{
J
Josef Bacik 已提交
470
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
471
		wait_current_trans(root);
C
Chris Mason 已提交
472 473
}

474 475 476 477
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;
478 479

	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
480 481 482 483 484 485 486 487
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	int updates;
488
	int err;
489

J
Josef Bacik 已提交
490
	smp_mb();
491 492 493 494 495
	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
496 497 498 499 500
	if (updates) {
		err = btrfs_run_delayed_refs(trans, root, updates);
		if (err) /* Error code will also eval true */
			return err;
	}
501 502 503 504

	return should_end_transaction(trans, root);
}

505
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
506
			  struct btrfs_root *root, int throttle, int lock)
C
Chris Mason 已提交
507
{
508
	struct btrfs_transaction *cur_trans = trans->transaction;
509
	struct btrfs_fs_info *info = root->fs_info;
510
	int count = 0;
511
	int err = 0;
512

513 514 515 516 517
	if (--trans->use_count) {
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

518 519 520 521 522
	/*
	 * do the qgroup accounting as early as possible
	 */
	err = btrfs_delayed_refs_qgroup_accounting(trans, info);

523
	btrfs_trans_release_metadata(trans, root);
524
	trans->block_rsv = NULL;
525 526 527 528 529
	/*
	 * the same root has to be passed to start_transaction and
	 * end_transaction. Subvolume quota depends on this.
	 */
	WARN_ON(trans->root != root);
530 531 532 533 534 535

	if (trans->qgroup_reserved) {
		btrfs_qgroup_free(root, trans->qgroup_reserved);
		trans->qgroup_reserved = 0;
	}

536
	while (count < 2) {
537 538 539 540 541 542 543 544 545 546
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
547
	}
548 549
	btrfs_trans_release_metadata(trans, root);
	trans->block_rsv = NULL;
550

J
Josef Bacik 已提交
551 552
	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
	    should_end_transaction(trans, root)) {
553
		trans->transaction->blocked = 1;
J
Josef Bacik 已提交
554 555
		smp_wmb();
	}
556

557
	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
558 559 560 561 562 563 564
		if (throttle) {
			/*
			 * We may race with somebody else here so end up having
			 * to call end_transaction on ourselves again, so inc
			 * our use_count.
			 */
			trans->use_count++;
565
			return btrfs_commit_transaction(trans, root);
566
		} else {
567
			wake_up_process(info->transaction_kthread);
568
		}
569 570 571
	}

	WARN_ON(cur_trans != info->running_transaction);
572 573
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
574

575
	smp_mb();
C
Chris Mason 已提交
576 577 578
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
J
Josef Bacik 已提交
579 580 581

	if (current->journal_info == trans)
		current->journal_info = NULL;
582

Y
Yan, Zheng 已提交
583 584 585
	if (throttle)
		btrfs_run_delayed_iputs(root);

586 587
	if (trans->aborted ||
	    root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
588
		err = -EIO;
589
	}
590
	assert_qgroups_uptodate(trans);
591

592 593 594
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
	return err;
C
Chris Mason 已提交
595 596
}

597 598 599
/*
 * End @trans without throttling, with commit kick-off allowed
 * (throttle = 0, lock = 1).  Returns the __btrfs_end_transaction() result.
 */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 1);
}

/*
 * End @trans with throttling (throttle = 1, lock = 1).  Returns the
 * __btrfs_end_transaction() result.
 */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

/*
 * End @trans without throttling and without ever blocking the transaction
 * or starting a commit (throttle = 0, lock = 0).  Returns the
 * __btrfs_end_transaction() result.
 */
int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 0);
}

/*
 * End @trans with throttle = 1 and lock = 1, the same arguments
 * btrfs_end_transaction_throttle() uses.  Returns the
 * __btrfs_end_transaction() result.
 */
int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

C
Chris Mason 已提交
636 637 638
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
639
 * those extents are sent to disk but does not wait on them
C
Chris Mason 已提交
640
 */
641
int btrfs_write_marked_extents(struct btrfs_root *root,
642
			       struct extent_io_tree *dirty_pages, int mark)
C
Chris Mason 已提交
643
{
644
	int err = 0;
645
	int werr = 0;
J
Josef Bacik 已提交
646
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
647
	u64 start = 0;
648
	u64 end;
649

J
Josef Bacik 已提交
650 651 652 653 654 655 656 657 658
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      mark)) {
		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
				   GFP_NOFS);
		err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
659
	}
660 661 662 663 664 665 666 667 668 669 670 671
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
672
			      struct extent_io_tree *dirty_pages, int mark)
673 674 675
{
	int err = 0;
	int werr = 0;
J
Josef Bacik 已提交
676
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
677 678
	u64 start = 0;
	u64 end;
679

J
Josef Bacik 已提交
680 681 682 683 684 685 686 687
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      EXTENT_NEED_WAIT)) {
		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
		err = filemap_fdatawait_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
688
	}
689 690 691
	if (err)
		werr = err;
	return werr;
C
Chris Mason 已提交
692 693
}

694 695 696 697 698 699
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 *
 * The wait pass always runs, even when the write pass failed.  A write
 * error takes precedence over a wait error in the return value.
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int write_err = btrfs_write_marked_extents(root, dirty_pages, mark);
	int wait_err = btrfs_wait_marked_extents(root, dirty_pages, mark);

	return write_err ? write_err : wait_err;
}

715 716 717 718 719 720 721 722 723
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
724 725
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
726 727
}

C
Chris Mason 已提交
728 729 730 731 732 733 734 735 736 737
/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
738 739
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
C
Chris Mason 已提交
740 741
{
	int ret;
742
	u64 old_root_bytenr;
743
	u64 old_root_used;
744
	struct btrfs_root *tree_root = root->fs_info->tree_root;
C
Chris Mason 已提交
745

746
	old_root_used = btrfs_root_used(&root->root_item);
747
	btrfs_write_dirty_block_groups(trans, root);
748

C
Chris Mason 已提交
749
	while (1) {
750
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
751 752
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
C
Chris Mason 已提交
753
			break;
754

755
		btrfs_set_root_node(&root->root_item, root->node);
C
Chris Mason 已提交
756
		ret = btrfs_update_root(trans, tree_root,
757 758
					&root->root_key,
					&root->root_item);
759 760
		if (ret)
			return ret;
761

762
		old_root_used = btrfs_root_used(&root->root_item);
763
		ret = btrfs_write_dirty_block_groups(trans, root);
764 765
		if (ret)
			return ret;
766
	}
767 768 769 770

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

771 772 773
	return 0;
}

C
Chris Mason 已提交
774 775
/*
 * update all the cowonly tree roots on disk
776 777 778 779
 *
 * The error handling in this function may not be obvious. Any of the
 * failures will cause the file system to go offline. We still need
 * to clean up the delayed refs.
C
Chris Mason 已提交
780
 */
781 782
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
783 784 785
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
786
	struct extent_buffer *eb;
787
	int ret;
788

789
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
790 791
	if (ret)
		return ret;
792

793
	eb = btrfs_lock_root_node(fs_info->tree_root);
794 795
	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
			      0, &eb);
796 797
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);
798

799 800 801
	if (ret)
		return ret;

802
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
803 804
	if (ret)
		return ret;
805

806 807 808
	ret = btrfs_run_dev_stats(trans, root->fs_info);
	BUG_ON(ret);

809 810 811 812 813 814 815
	ret = btrfs_run_qgroups(trans, root->fs_info);
	BUG_ON(ret);

	/* run_qgroups might have added some more refs */
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

C
Chris Mason 已提交
816
	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
817 818 819
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);
820

821 822 823
		ret = update_cowonly_root(trans, root);
		if (ret)
			return ret;
C
Chris Mason 已提交
824
	}
825 826 827 828 829

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

C
Chris Mason 已提交
830 831 832
	return 0;
}

C
Chris Mason 已提交
833 834 835 836 837
/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
838
int btrfs_add_dead_root(struct btrfs_root *root)
839
{
J
Josef Bacik 已提交
840
	spin_lock(&root->fs_info->trans_lock);
841
	list_add(&root->root_list, &root->fs_info->dead_roots);
J
Josef Bacik 已提交
842
	spin_unlock(&root->fs_info->trans_lock);
843 844 845
	return 0;
}

C
Chris Mason 已提交
846
/*
847
 * update all the cowonly tree roots on disk
C
Chris Mason 已提交
848
 */
849 850
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
851 852
{
	struct btrfs_root *gang[8];
853
	struct btrfs_fs_info *fs_info = root->fs_info;
854 855
	int i;
	int ret;
856 857
	int err = 0;

J
Josef Bacik 已提交
858
	spin_lock(&fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
859
	while (1) {
860 861
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
862 863 864 865 866 867
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
868 869 870
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
871
			spin_unlock(&fs_info->fs_roots_radix_lock);
Y
Yan Zheng 已提交
872

873
			btrfs_free_log(trans, root);
874
			btrfs_update_reloc_root(trans, root);
875
			btrfs_orphan_commit_root(trans, root);
876

877 878
			btrfs_save_ino_cache(root, trans);

879 880 881 882
			/* see comments in should_cow_block() */
			root->force_cow = 0;
			smp_wmb();

883
			if (root->commit_root != root->node) {
884
				mutex_lock(&root->fs_commit_mutex);
J
Josef Bacik 已提交
885
				switch_commit_root(root);
886 887 888
				btrfs_unpin_free_ino(root);
				mutex_unlock(&root->fs_commit_mutex);

889 890 891
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}
892 893

			err = btrfs_update_root(trans, fs_info->tree_root,
894 895
						&root->root_key,
						&root->root_item);
J
Josef Bacik 已提交
896
			spin_lock(&fs_info->fs_roots_radix_lock);
897 898
			if (err)
				break;
899 900
		}
	}
J
Josef Bacik 已提交
901
	spin_unlock(&fs_info->fs_roots_radix_lock);
902
	return err;
903 904
}

C
Chris Mason 已提交
905 906 907 908
/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
909 910 911 912
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
913
	int ret;
914
	unsigned long nr;
915

916
	if (xchg(&root->defrag_running, 1))
917
		return 0;
918

919
	while (1) {
920 921 922 923
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

924
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
925

926
		nr = trans->blocks_used;
927
		btrfs_end_transaction(trans, root);
928
		btrfs_btree_balance_dirty(info->tree_root, nr);
929 930
		cond_resched();

931
		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
932 933 934
			break;
	}
	root->defrag_running = 0;
935
	return ret;
936 937
}

C
Chris Mason 已提交
938 939 940 941
/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
942
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
943 944 945 946
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
947
	struct btrfs_root_item *new_root_item;
948 949
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
950
	struct btrfs_root *parent_root;
L
Liu Bo 已提交
951
	struct btrfs_block_rsv *rsv;
952
	struct inode *parent_inode;
953
	struct dentry *parent;
954
	struct dentry *dentry;
955
	struct extent_buffer *tmp;
956
	struct extent_buffer *old;
957
	struct timespec cur_time = CURRENT_TIME;
958
	int ret;
959
	u64 to_reserve = 0;
960
	u64 index = 0;
961
	u64 objectid;
L
Li Zefan 已提交
962
	u64 root_flags;
963
	uuid_le new_uuid;
964

L
Liu Bo 已提交
965 966
	rsv = trans->block_rsv;

967 968
	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
969
		ret = pending->error = -ENOMEM;
970 971
		goto fail;
	}
972

973
	ret = btrfs_find_free_objectid(tree_root, &objectid);
974 975
	if (ret) {
		pending->error = ret;
976
		goto fail;
977
	}
978

979
	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
980 981

	if (to_reserve > 0) {
982 983
		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
						  to_reserve);
984 985 986 987 988 989
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

A
Arne Jansen 已提交
990 991 992 993 994 995 996 997
	ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
				   objectid, pending->inherit);
	kfree(pending->inherit);
	if (ret) {
		pending->error = ret;
		goto fail;
	}

998
	key.objectid = objectid;
999 1000
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;
1001

1002
	trans->block_rsv = &pending->block_rsv;
1003

1004
	dentry = pending->dentry;
1005 1006
	parent = dget_parent(dentry);
	parent_inode = parent->d_inode;
1007
	parent_root = BTRFS_I(parent_inode)->root;
C
Chris Mason 已提交
1008
	record_root_in_trans(trans, parent_root);
1009

1010 1011 1012
	/*
	 * insert the directory item
	 */
1013
	ret = btrfs_set_inode_index(parent_inode, &index);
1014
	BUG_ON(ret); /* -ENOMEM */
1015
	ret = btrfs_insert_dir_item(trans, parent_root,
1016
				dentry->d_name.name, dentry->d_name.len,
1017
				parent_inode, &key,
1018
				BTRFS_FT_DIR, index);
1019
	if (ret == -EEXIST) {
1020 1021 1022
		pending->error = -EEXIST;
		dput(parent);
		goto fail;
1023 1024 1025
	} else if (ret) {
		goto abort_trans_dput;
	}
1026

1027 1028
	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
1029
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
1030
	if (ret)
1031
		goto abort_trans_dput;
1032

1033 1034 1035 1036 1037 1038 1039
	/*
	 * pull in the delayed directory update
	 * and the delayed inode item
	 * otherwise we corrupt the FS during
	 * snapshot
	 */
	ret = btrfs_run_delayed_items(trans, root);
1040 1041
	if (ret) { /* Transaction aborted */
		dput(parent);
1042
		goto fail;
1043
	}
1044

C
Chris Mason 已提交
1045
	record_root_in_trans(trans, root);
1046 1047
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
1048
	btrfs_check_and_init_root_item(new_root_item);
1049

L
Li Zefan 已提交
1050 1051 1052 1053 1054 1055 1056
	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070
	btrfs_set_root_generation_v2(new_root_item,
			trans->transid);
	uuid_le_gen(&new_uuid);
	memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
			BTRFS_UUID_SIZE);
	new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
	new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec);
	btrfs_set_root_otransid(new_root_item, trans->transid);
	memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
	memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
	btrfs_set_root_stransid(new_root_item, 0);
	btrfs_set_root_rtransid(new_root_item, 0);

1071
	old = btrfs_lock_root_node(root);
1072
	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
1073 1074 1075 1076 1077
	if (ret) {
		btrfs_tree_unlock(old);
		free_extent_buffer(old);
		goto abort_trans_dput;
	}
1078

1079 1080
	btrfs_set_lock_blocking(old);

1081
	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
1082
	/* clean up in any case */
1083 1084
	btrfs_tree_unlock(old);
	free_extent_buffer(old);
1085 1086
	if (ret)
		goto abort_trans_dput;
1087

1088 1089 1090 1091
	/* see comments in should_cow_block() */
	root->force_cow = 1;
	smp_wmb();

1092
	btrfs_set_root_node(new_root_item, tmp);
1093 1094 1095
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
1096 1097
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
1098
	if (ret)
1099
		goto abort_trans_dput;
1100

1101 1102 1103 1104
	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
1105
				 parent_root->root_key.objectid,
L
Li Zefan 已提交
1106
				 btrfs_ino(parent_inode), index,
1107
				 dentry->d_name.name, dentry->d_name.len);
1108
	dput(parent);
1109 1110
	if (ret)
		goto fail;
1111

1112 1113
	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
1114 1115
	if (IS_ERR(pending->snap)) {
		ret = PTR_ERR(pending->snap);
1116
		goto abort_trans;
1117
	}
1118

1119 1120 1121 1122
	ret = btrfs_reloc_post_snapshot(trans, pending);
	if (ret)
		goto abort_trans;
	ret = 0;
1123
fail:
1124
	kfree(new_root_item);
L
Liu Bo 已提交
1125
	trans->block_rsv = rsv;
1126
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1127 1128
	return ret;

1129 1130
abort_trans_dput:
	dput(parent);
1131 1132 1133
abort_trans:
	btrfs_abort_transaction(trans, root, ret);
	goto fail;
1134 1135
}

C
Chris Mason 已提交
1136 1137 1138
/*
 * create all the snapshots we've scheduled for creation
 */
1139 1140
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
1141 1142 1143 1144
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;

1145 1146
	list_for_each_entry(pending, head, list)
		create_pending_snapshot(trans, fs_info, pending);
1147 1148 1149
	return 0;
}

1150 1151 1152 1153 1154
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

1155
	super = root->fs_info->super_copy;
1156 1157 1158 1159 1160 1161 1162 1163 1164 1165

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
1166
	if (btrfs_test_opt(root, SPACE_CACHE))
1167
		super->cache_generation = root_item->generation;
1168 1169
}

1170 1171 1172
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1173
	spin_lock(&info->trans_lock);
1174 1175
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
J
Josef Bacik 已提交
1176
	spin_unlock(&info->trans_lock);
1177 1178 1179
	return ret;
}

1180 1181 1182
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1183
	spin_lock(&info->trans_lock);
1184 1185
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
J
Josef Bacik 已提交
1186
	spin_unlock(&info->trans_lock);
1187 1188 1189
	return ret;
}

S
Sage Weil 已提交
1190 1191 1192 1193 1194 1195 1196
/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
					    struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1197
	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
S
Sage Weil 已提交
1198 1199 1200 1201 1202 1203 1204 1205 1206
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
					 struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1207 1208
	wait_event(root->fs_info->transaction_wait,
		   trans->commit_done || (trans->in_commit && !trans->blocked));
S
Sage Weil 已提交
1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237
}

/*
 * commit transactions asynchronously. once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 *
 * One of these is allocated per asynchronous commit request and freed by
 * the work function once the commit call has returned.
 */
struct btrfs_async_commit {
	/* handle joined on behalf of the worker; passed to the commit call */
	struct btrfs_trans_handle *newtrans;
	/* root the commit is issued against */
	struct btrfs_root *root;
	/* delayed work item whose callback performs the commit */
	struct delayed_work work;
};

/*
 * Work callback for an asynchronous commit: recover the enclosing
 * btrfs_async_commit from the work item, commit its transaction handle and
 * free the request.  The commit's return value is not examined.
 */
static void do_async_commit(struct work_struct *work)
{
	struct delayed_work *dwork =
		container_of(work, struct delayed_work, work);
	struct btrfs_async_commit *request =
		container_of(dwork, struct btrfs_async_commit, work);

	btrfs_commit_transaction(request->newtrans, request->root);
	kfree(request);
}

int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   int wait_for_unblock)
{
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
T
Tsutomu Itoh 已提交
1238 1239
	if (!ac)
		return -ENOMEM;
S
Sage Weil 已提交
1240 1241 1242

	INIT_DELAYED_WORK(&ac->work, do_async_commit);
	ac->root = root;
1243
	ac->newtrans = btrfs_join_transaction(root);
1244 1245 1246 1247 1248
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}
S
Sage Weil 已提交
1249 1250 1251

	/* take transaction reference */
	cur_trans = trans->transaction;
1252
	atomic_inc(&cur_trans->use_count);
S
Sage Weil 已提交
1253 1254 1255 1256 1257 1258 1259 1260 1261 1262

	btrfs_end_transaction(trans, root);
	schedule_delayed_work(&ac->work, 0);

	/* wait for transaction to start and unblock */
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(root, cur_trans);
	else
		wait_current_trans_commit_start(root, cur_trans);

1263 1264 1265 1266
	if (current->journal_info == trans)
		current->journal_info = NULL;

	put_transaction(cur_trans);
S
Sage Weil 已提交
1267 1268 1269
	return 0;
}

1270 1271

static void cleanup_transaction(struct btrfs_trans_handle *trans,
1272
				struct btrfs_root *root, int err)
1273 1274 1275 1276 1277
{
	struct btrfs_transaction *cur_trans = trans->transaction;

	WARN_ON(trans->use_count > 1);

1278 1279
	btrfs_abort_transaction(trans, root, err);

1280 1281
	spin_lock(&root->fs_info->trans_lock);
	list_del_init(&cur_trans->list);
1282 1283 1284 1285
	if (cur_trans == root->fs_info->running_transaction) {
		root->fs_info->running_transaction = NULL;
		root->fs_info->trans_no_join = 0;
	}
1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302
	spin_unlock(&root->fs_info->trans_lock);

	btrfs_cleanup_one_transaction(trans->transaction, root);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	trace_btrfs_transaction_commit(root);

	btrfs_scrub_continue(root);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
}

S
Sage Weil 已提交
1303 1304 1305 1306 1307 1308 1309
/*
 * btrfs_transaction state sequence:
 *    in_commit = 0, blocked = 0  (initial)
 *    in_commit = 1, blocked = 1
 *    blocked = 0
 *    commit_done = 1
 */
C
Chris Mason 已提交
1310 1311 1312
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
1313
	unsigned long joined = 0;
1314
	struct btrfs_transaction *cur_trans = trans->transaction;
C
Chris Mason 已提交
1315
	struct btrfs_transaction *prev_trans = NULL;
C
Chris Mason 已提交
1316
	DEFINE_WAIT(wait);
1317
	int ret = -EIO;
1318 1319
	int should_grow = 0;
	unsigned long now = get_seconds();
1320
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
C
Chris Mason 已提交
1321

1322 1323
	btrfs_run_ordered_operations(root, 0);

1324 1325 1326
	if (cur_trans->aborted)
		goto cleanup_transaction;

1327 1328 1329 1330
	/* make a pass through all the delayed refs we have so far
	 * any runnings procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
1331 1332
	if (ret)
		goto cleanup_transaction;
1333

1334 1335 1336
	btrfs_trans_release_metadata(trans, root);
	trans->block_rsv = NULL;

1337
	cur_trans = trans->transaction;
1338

1339 1340 1341 1342
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
1343
	cur_trans->delayed_refs.flushing = 1;
1344

1345
	ret = btrfs_run_delayed_refs(trans, root, 0);
1346 1347
	if (ret)
		goto cleanup_transaction;
1348

J
Josef Bacik 已提交
1349
	spin_lock(&cur_trans->commit_lock);
1350
	if (cur_trans->in_commit) {
J
Josef Bacik 已提交
1351
		spin_unlock(&cur_trans->commit_lock);
1352
		atomic_inc(&cur_trans->use_count);
1353
		ret = btrfs_end_transaction(trans, root);
C
Chris Mason 已提交
1354

1355
		wait_for_commit(root, cur_trans);
1356

C
Chris Mason 已提交
1357
		put_transaction(cur_trans);
1358

1359
		return ret;
C
Chris Mason 已提交
1360
	}
1361

C
Chris Mason 已提交
1362
	trans->transaction->in_commit = 1;
1363
	trans->transaction->blocked = 1;
J
Josef Bacik 已提交
1364
	spin_unlock(&cur_trans->commit_lock);
S
Sage Weil 已提交
1365 1366
	wake_up(&root->fs_info->transaction_blocked_wait);

J
Josef Bacik 已提交
1367
	spin_lock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1368 1369 1370 1371
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
1372
			atomic_inc(&prev_trans->use_count);
J
Josef Bacik 已提交
1373
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1374 1375 1376

			wait_for_commit(root, prev_trans);

1377
			put_transaction(prev_trans);
J
Josef Bacik 已提交
1378 1379
		} else {
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1380
		}
J
Josef Bacik 已提交
1381 1382
	} else {
		spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1383
	}
1384

1385 1386
	if (!btrfs_test_opt(root, SSD) &&
	    (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1387 1388
		should_grow = 1;

1389
	do {
1390
		int snap_pending = 0;
J
Josef Bacik 已提交
1391

1392
		joined = cur_trans->num_joined;
1393 1394 1395
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

C
Chris Mason 已提交
1396
		WARN_ON(cur_trans != trans->transaction);
1397

1398
		if (flush_on_commit || snap_pending) {
Y
Yan, Zheng 已提交
1399
			btrfs_start_delalloc_inodes(root, 1);
1400
			btrfs_wait_ordered_extents(root, 0, 1);
1401 1402
		}

1403
		ret = btrfs_run_delayed_items(trans, root);
1404 1405
		if (ret)
			goto cleanup_transaction;
1406

1407 1408 1409 1410 1411 1412 1413
		/*
		 * running the delayed items may have added new refs. account
		 * them now so that they hinder processing of more delayed refs
		 * as little as possible.
		 */
		btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);

1414 1415 1416 1417 1418 1419 1420 1421 1422
		/*
		 * rename don't use btrfs_join_transaction, so, once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and no for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

1423 1424 1425
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

1426
		if (atomic_read(&cur_trans->num_writers) > 1)
1427 1428 1429
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);
1430 1431

		finish_wait(&cur_trans->writer_wait, &wait);
1432
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
1433
		 (should_grow && cur_trans->num_joined != joined));
1434

1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445
	/*
	 * Ok now we need to make sure to block out any other joins while we
	 * commit the transaction.  We could have started a join before setting
	 * no_join so make sure to wait for num_writers to == 1 again.
	 */
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->trans_no_join = 1;
	spin_unlock(&root->fs_info->trans_lock);
	wait_event(cur_trans->writer_wait,
		   atomic_read(&cur_trans->num_writers) == 1);

C
Chris Mason 已提交
1446 1447 1448 1449 1450 1451 1452
	/*
	 * the reloc mutex makes sure that we stop
	 * the balancing code from coming in and moving
	 * extents around in the middle of the commit
	 */
	mutex_lock(&root->fs_info->reloc_mutex);

1453
	ret = btrfs_run_delayed_items(trans, root);
1454 1455 1456 1457
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1458

1459
	ret = create_pending_snapshots(trans, root->fs_info);
1460 1461 1462 1463
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1464

1465
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1466 1467 1468 1469
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1470

1471 1472 1473 1474 1475 1476
	/*
	 * make sure none of the code above managed to slip in a
	 * delayed item
	 */
	btrfs_assert_delayed_root_empty(root);

C
Chris Mason 已提交
1477
	WARN_ON(cur_trans != trans->transaction);
C
Chris Mason 已提交
1478

A
Arne Jansen 已提交
1479
	btrfs_scrub_pause(root);
1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494
	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

1495
	ret = commit_fs_roots(trans, root);
1496 1497
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
1498
		mutex_unlock(&root->fs_info->reloc_mutex);
1499 1500
		goto cleanup_transaction;
	}
1501

1502
	/* commit_fs_roots gets rid of all the tree log roots, it is now
1503 1504 1505 1506
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

1507
	ret = commit_cowonly_roots(trans, root);
1508 1509
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
1510
		mutex_unlock(&root->fs_info->reloc_mutex);
1511 1512
		goto cleanup_transaction;
	}
1513

1514 1515
	btrfs_prepare_extent_commit(trans, root);

C
Chris Mason 已提交
1516
	cur_trans = root->fs_info->running_transaction;
1517 1518 1519

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
J
Josef Bacik 已提交
1520
	switch_commit_root(root->fs_info->tree_root);
1521 1522 1523

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
J
Josef Bacik 已提交
1524
	switch_commit_root(root->fs_info->chunk_root);
1525

1526
	assert_qgroups_uptodate(trans);
1527
	update_super_roots(root);
1528 1529

	if (!root->fs_info->log_root_recovering) {
1530 1531
		btrfs_set_super_log_root(root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1532 1533
	}

1534 1535
	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
	       sizeof(*root->fs_info->super_copy));
C
Chris Mason 已提交
1536

1537
	trans->transaction->blocked = 0;
J
Josef Bacik 已提交
1538 1539 1540 1541
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->running_transaction = NULL;
	root->fs_info->trans_no_join = 0;
	spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1542
	mutex_unlock(&root->fs_info->reloc_mutex);
1543

1544
	wake_up(&root->fs_info->transaction_wait);
1545

C
Chris Mason 已提交
1546
	ret = btrfs_write_and_wait_transaction(trans, root);
1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
	if (ret) {
		btrfs_error(root->fs_info, ret,
			    "Error while writing out transaction.");
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}

	ret = write_ctree_super(trans, root, 0);
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}
1559

1560 1561 1562 1563 1564 1565
	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

1566
	btrfs_finish_extent_commit(trans, root);
1567

C
Chris Mason 已提交
1568
	cur_trans->commit_done = 1;
1569

1570
	root->fs_info->last_trans_committed = cur_trans->transid;
J
Josef Bacik 已提交
1571

C
Chris Mason 已提交
1572
	wake_up(&cur_trans->commit_wait);
1573

J
Josef Bacik 已提交
1574
	spin_lock(&root->fs_info->trans_lock);
1575
	list_del_init(&cur_trans->list);
J
Josef Bacik 已提交
1576 1577
	spin_unlock(&root->fs_info->trans_lock);

C
Chris Mason 已提交
1578
	put_transaction(cur_trans);
C
Chris Mason 已提交
1579
	put_transaction(cur_trans);
1580

1581 1582
	trace_btrfs_transaction_commit(root);

A
Arne Jansen 已提交
1583 1584
	btrfs_scrub_continue(root);

J
Josef Bacik 已提交
1585 1586 1587
	if (current->journal_info == trans)
		current->journal_info = NULL;

C
Chris Mason 已提交
1588
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
Y
Yan, Zheng 已提交
1589 1590 1591 1592

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

C
Chris Mason 已提交
1593
	return ret;
1594 1595

cleanup_transaction:
1596 1597
	btrfs_trans_release_metadata(trans, root);
	trans->block_rsv = NULL;
1598 1599 1600 1601
	btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
//	WARN_ON(1);
	if (current->journal_info == trans)
		current->journal_info = NULL;
1602
	cleanup_transaction(trans, root, ret);
1603 1604

	return ret;
C
Chris Mason 已提交
1605 1606
}

C
Chris Mason 已提交
1607 1608 1609
/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
1610 1611
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
1612 1613 1614
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

J
Josef Bacik 已提交
1615
	spin_lock(&fs_info->trans_lock);
1616
	list_splice_init(&fs_info->dead_roots, &list);
J
Josef Bacik 已提交
1617
	spin_unlock(&fs_info->trans_lock);
1618

1619
	while (!list_empty(&list)) {
1620 1621
		int ret;

1622
		root = list_entry(list.next, struct btrfs_root, root_list);
1623 1624
		list_del(&root->root_list);

1625 1626
		btrfs_kill_all_delayed_nodes(root);

1627 1628
		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
1629
			ret = btrfs_drop_snapshot(root, NULL, 0, 0);
1630
		else
1631 1632
			ret =btrfs_drop_snapshot(root, NULL, 1, 0);
		BUG_ON(ret < 0);
1633 1634 1635
	}
	return 0;
}