transaction.c 36.5 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

C
Chris Mason 已提交
19
#include <linux/fs.h>
20
#include <linux/slab.h>
C
Chris Mason 已提交
21
#include <linux/sched.h>
22
#include <linux/writeback.h>
23
#include <linux/pagemap.h>
24
#include <linux/blkdev.h>
C
Chris Mason 已提交
25 26 27
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
28
#include "locking.h"
29
#include "tree-log.h"
30
#include "inode-map.h"
C
Chris Mason 已提交
31

32 33
/*
 * radix tree tag used on fs_info->fs_roots_radix: it is set on a root when
 * record_root_in_trans() records it in a transaction and cleared again when
 * commit_fs_roots() processes that root.
 */
#define BTRFS_ROOT_TRANS_TAG 0

34
static noinline void put_transaction(struct btrfs_transaction *transaction)
C
Chris Mason 已提交
35
{
36 37
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
J
Josef Bacik 已提交
38
		BUG_ON(!list_empty(&transaction->list));
39 40
		WARN_ON(transaction->delayed_refs.root.rb_node);
		WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
C
Chris Mason 已提交
41 42
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
C
Chris Mason 已提交
43
	}
C
Chris Mason 已提交
44 45
}

J
Josef Bacik 已提交
46 47 48 49 50 51
/*
 * release the extent buffer currently held in root->commit_root and
 * repoint commit_root at whatever btrfs_root_node() returns for this root.
 */
static noinline void switch_commit_root(struct btrfs_root *root)
{
	struct extent_buffer *stale = root->commit_root;

	free_extent_buffer(stale);
	root->commit_root = btrfs_root_node(root);
}

C
Chris Mason 已提交
52 53 54
/*
 * either allocate a new transaction or hop into the existing one
 */
J
Josef Bacik 已提交
55
static noinline int join_transaction(struct btrfs_root *root, int nofail)
C
Chris Mason 已提交
56 57
{
	struct btrfs_transaction *cur_trans;
J
Josef Bacik 已提交
58 59

	spin_lock(&root->fs_info->trans_lock);
60
loop:
J
Josef Bacik 已提交
61 62 63 64 65 66 67
	if (root->fs_info->trans_no_join) {
		if (!nofail) {
			spin_unlock(&root->fs_info->trans_lock);
			return -EBUSY;
		}
	}

C
Chris Mason 已提交
68
	cur_trans = root->fs_info->running_transaction;
J
Josef Bacik 已提交
69 70
	if (cur_trans) {
		atomic_inc(&cur_trans->use_count);
71
		atomic_inc(&cur_trans->num_writers);
72
		cur_trans->num_joined++;
J
Josef Bacik 已提交
73 74
		spin_unlock(&root->fs_info->trans_lock);
		return 0;
C
Chris Mason 已提交
75
	}
J
Josef Bacik 已提交
76 77 78 79 80
	spin_unlock(&root->fs_info->trans_lock);

	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
81

J
Josef Bacik 已提交
82 83
	spin_lock(&root->fs_info->trans_lock);
	if (root->fs_info->running_transaction) {
84 85 86 87
		/*
		 * someone started a transaction after we unlocked.  Make sure
		 * to redo the trans_no_join checks above
		 */
J
Josef Bacik 已提交
88 89
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		cur_trans = root->fs_info->running_transaction;
90
		goto loop;
C
Chris Mason 已提交
91
	}
92

J
Josef Bacik 已提交
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
	atomic_set(&cur_trans->num_writers, 1);
	cur_trans->num_joined = 0;
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->in_commit = 0;
	cur_trans->blocked = 0;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
	cur_trans->commit_done = 0;
	cur_trans->start_time = get_seconds();

	cur_trans->delayed_refs.root = RB_ROOT;
	cur_trans->delayed_refs.num_entries = 0;
	cur_trans->delayed_refs.num_heads_ready = 0;
	cur_trans->delayed_refs.num_heads = 0;
	cur_trans->delayed_refs.flushing = 0;
	cur_trans->delayed_refs.run_delayed_start = 0;
113
	cur_trans->delayed_refs.seq = 1;
J
Josef Bacik 已提交
114 115
	spin_lock_init(&cur_trans->commit_lock);
	spin_lock_init(&cur_trans->delayed_refs.lock);
116
	INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
J
Josef Bacik 已提交
117 118 119 120

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
	list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
	extent_io_tree_init(&cur_trans->dirty_pages,
C
Chris Mason 已提交
121
			     root->fs_info->btree_inode->i_mapping);
J
Josef Bacik 已提交
122 123 124 125
	root->fs_info->generation++;
	cur_trans->transid = root->fs_info->generation;
	root->fs_info->running_transaction = cur_trans;
	spin_unlock(&root->fs_info->trans_lock);
126

C
Chris Mason 已提交
127 128 129
	return 0;
}

C
Chris Mason 已提交
130
/*
C
Chris Mason 已提交
131 132 133 134
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
C
Chris Mason 已提交
135
 */
C
Chris Mason 已提交
136
static int record_root_in_trans(struct btrfs_trans_handle *trans,
J
Josef Bacik 已提交
137
			       struct btrfs_root *root)
138
{
139
	if (root->ref_cows && root->last_trans < trans->transid) {
140
		WARN_ON(root == root->fs_info->extent_root);
141 142
		WARN_ON(root->commit_root != root->node);

C
Chris Mason 已提交
143 144 145 146 147 148 149 150 151 152 153 154
		/*
		 * see below for in_trans_setup usage rules
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
		root->in_trans_setup = 1;

		/* make sure readers find in_trans_setup before
		 * they find our root->last_trans update
		 */
		smp_wmb();

J
Josef Bacik 已提交
155 156 157 158 159
		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
160 161 162
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
163
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
		 * with root->in_trans_setup.  When this is 1, we're still
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
185
		btrfs_init_reloc_root(trans, root);
C
Chris Mason 已提交
186 187
		smp_wmb();
		root->in_trans_setup = 0;
188 189 190
	}
	return 0;
}
191

C
Chris Mason 已提交
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214

/*
 * lock-taking front end for record_root_in_trans().  The reloc mutex is
 * only taken when the root still needs to be set up for this transaction.
 *
 * Always returns 0.
 */
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int already_recorded;

	if (!root->ref_cows)
		return 0;

	/*
	 * see record_root_in_trans for comments about in_trans_setup usage
	 * and barriers
	 */
	smp_rmb();
	already_recorded = root->last_trans == trans->transid &&
			   !root->in_trans_setup;

	if (!already_recorded) {
		mutex_lock(&root->fs_info->reloc_mutex);
		record_root_in_trans(trans, root);
		mutex_unlock(&root->fs_info->reloc_mutex);
	}

	return 0;
}

C
Chris Mason 已提交
215 216 217 218
/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
C
Chris Mason 已提交
219
static void wait_current_trans(struct btrfs_root *root)
C
Chris Mason 已提交
220
{
221
	struct btrfs_transaction *cur_trans;
C
Chris Mason 已提交
222

J
Josef Bacik 已提交
223
	spin_lock(&root->fs_info->trans_lock);
224
	cur_trans = root->fs_info->running_transaction;
C
Chris Mason 已提交
225
	if (cur_trans && cur_trans->blocked) {
226
		atomic_inc(&cur_trans->use_count);
J
Josef Bacik 已提交
227
		spin_unlock(&root->fs_info->trans_lock);
L
Li Zefan 已提交
228 229 230

		wait_event(root->fs_info->transaction_wait,
			   !cur_trans->blocked);
231
		put_transaction(cur_trans);
J
Josef Bacik 已提交
232 233
	} else {
		spin_unlock(&root->fs_info->trans_lock);
234
	}
C
Chris Mason 已提交
235 236
}

237 238 239 240
/* how a caller wants to enter a transaction, see start_transaction() */
enum btrfs_trans_type {
	/* used by btrfs_start_transaction() */
	TRANS_START = 0,
	/* used by btrfs_join_transaction() */
	TRANS_JOIN,
	/* used by btrfs_start_ioctl_transaction() */
	TRANS_USERSPACE,
	/* used by btrfs_join_transaction_nolock() */
	TRANS_JOIN_NOLOCK,
};

244 245
static int may_wait_transaction(struct btrfs_root *root, int type)
{
J
Josef Bacik 已提交
246 247 248 249 250 251 252 253
	if (root->fs_info->log_root_recovering)
		return 0;

	if (type == TRANS_USERSPACE)
		return 1;

	if (type == TRANS_START &&
	    !atomic_read(&root->fs_info->open_ioctl_trans))
254
		return 1;
J
Josef Bacik 已提交
255

256 257 258
	return 0;
}

259
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
260
						    u64 num_items, int type)
C
Chris Mason 已提交
261
{
262 263
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
264
	u64 num_bytes = 0;
C
Chris Mason 已提交
265
	int ret;
L
liubo 已提交
266 267 268

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return ERR_PTR(-EROFS);
269 270 271 272 273 274 275 276 277

	if (current->journal_info) {
		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
		h = current->journal_info;
		h->use_count++;
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
278 279 280 281 282 283 284

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items > 0 && root != root->fs_info->chunk_root) {
		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
285
		ret = btrfs_block_rsv_add(root,
286 287 288 289 290
					  &root->fs_info->trans_block_rsv,
					  num_bytes);
		if (ret)
			return ERR_PTR(ret);
	}
291 292 293 294
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);
C
Chris Mason 已提交
295

296
	if (may_wait_transaction(root, type))
C
Chris Mason 已提交
297
		wait_current_trans(root);
298

J
Josef Bacik 已提交
299 300 301 302 303 304
	do {
		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
		if (ret == -EBUSY)
			wait_current_trans(root);
	} while (ret == -EBUSY);

T
Tsutomu Itoh 已提交
305
	if (ret < 0) {
306
		kmem_cache_free(btrfs_trans_handle_cachep, h);
T
Tsutomu Itoh 已提交
307 308
		return ERR_PTR(ret);
	}
309

310 311 312 313
	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
C
Chris Mason 已提交
314
	h->blocks_used = 0;
315
	h->bytes_reserved = 0;
316
	h->delayed_ref_updates = 0;
317
	h->use_count = 1;
318
	h->block_rsv = NULL;
319
	h->orig_rsv = NULL;
320

321 322 323 324 325 326
	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

327 328 329
	if (num_bytes) {
		h->block_rsv = &root->fs_info->trans_block_rsv;
		h->bytes_reserved = num_bytes;
330
	}
J
Josef Bacik 已提交
331

332
got_it:
J
Josef Bacik 已提交
333
	btrfs_record_root_in_trans(h, root);
334 335 336

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
C
Chris Mason 已提交
337 338 339
	return h;
}

340
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
341
						   int num_items)
342
{
343
	return start_transaction(root, num_items, TRANS_START);
344
}
345
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
346
{
347
	return start_transaction(root, 0, TRANS_JOIN);
348 349
}

350
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
351 352 353 354
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

355
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
356
{
357
	return start_transaction(root, 0, TRANS_USERSPACE);
358 359
}

C
Chris Mason 已提交
360
/* wait for a transaction commit to be fully complete */
361
static noinline void wait_for_commit(struct btrfs_root *root,
362 363
				    struct btrfs_transaction *commit)
{
L
Li Zefan 已提交
364
	wait_event(commit->commit_wait, commit->commit_done);
365 366
}

367 368 369 370 371 372 373 374
/*
 * wait for a transaction to finish committing.
 *
 * @transid != 0: wait for that specific transaction.  Returns 0 right away
 * if it is not newer than last_trans_committed, -EINVAL if it is not on the
 * transaction list.
 *
 * @transid == 0: wait for the newest transaction that is in commit but not
 * yet done.  Returns 0 without waiting if there is none.
 */
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_transaction *target = NULL;
	struct btrfs_transaction *pos;

	if (transid) {
		if (transid <= fs_info->last_trans_committed)
			return 0;

		/* find specified transaction */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry(pos, &fs_info->trans_list, list) {
			/* walked past where it would be, stop looking */
			if (pos->transid > transid)
				break;
			if (pos->transid == transid) {
				target = pos;
				atomic_inc(&target->use_count);
				break;
			}
		}
		spin_unlock(&fs_info->trans_lock);

		if (!target)
			return -EINVAL;	/* bad transid */
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry_reverse(pos, &fs_info->trans_list,
					    list) {
			if (!pos->in_commit)
				continue;
			/* only an unfinished commit is worth waiting on */
			if (!pos->commit_done) {
				target = pos;
				atomic_inc(&target->use_count);
			}
			break;
		}
		spin_unlock(&fs_info->trans_lock);

		if (!target)
			return 0;	/* nothing committing|committed */
	}

	wait_for_commit(root, target);

	/* drop the reference taken under trans_lock */
	put_transaction(target);
	return 0;
}

C
Chris Mason 已提交
418 419
void btrfs_throttle(struct btrfs_root *root)
{
J
Josef Bacik 已提交
420
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
421
		wait_current_trans(root);
C
Chris Mason 已提交
422 423
}

424 425 426 427
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;
428 429

	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
430 431 432 433 434 435 436
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
437
	struct btrfs_block_rsv *rsv = trans->block_rsv;
438 439
	int updates;

J
Josef Bacik 已提交
440
	smp_mb();
441 442 443
	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

444 445 446 447 448 449
	/*
	 * We need to do this in case we're deleting csums so the global block
	 * rsv get's used instead of the csum block rsv.
	 */
	trans->block_rsv = NULL;

450 451 452 453 454
	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
	if (updates)
		btrfs_run_delayed_refs(trans, root, updates);

455 456
	trans->block_rsv = rsv;

457 458 459
	return should_end_transaction(trans, root);
}

460
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
461
			  struct btrfs_root *root, int throttle, int lock)
C
Chris Mason 已提交
462
{
463
	struct btrfs_transaction *cur_trans = trans->transaction;
464
	struct btrfs_fs_info *info = root->fs_info;
465 466
	int count = 0;

467 468 469 470 471
	if (--trans->use_count) {
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

472
	btrfs_trans_release_metadata(trans, root);
473
	trans->block_rsv = NULL;
474 475 476 477 478 479
	while (count < 4) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;
480 481 482 483 484 485 486

			/*
			 * do a full flush if the transaction is trying
			 * to close
			 */
			if (trans->transaction->delayed_refs.flushing)
				cur = 0;
487 488 489 490 491
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
492 493
	}

J
Josef Bacik 已提交
494 495
	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
	    should_end_transaction(trans, root)) {
496
		trans->transaction->blocked = 1;
J
Josef Bacik 已提交
497 498
		smp_wmb();
	}
499

500
	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
501 502 503 504 505 506 507
		if (throttle) {
			/*
			 * We may race with somebody else here so end up having
			 * to call end_transaction on ourselves again, so inc
			 * our use_count.
			 */
			trans->use_count++;
508
			return btrfs_commit_transaction(trans, root);
509
		} else {
510
			wake_up_process(info->transaction_kthread);
511
		}
512 513 514
	}

	WARN_ON(cur_trans != info->running_transaction);
515 516
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
517

518
	smp_mb();
C
Chris Mason 已提交
519 520 521
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
J
Josef Bacik 已提交
522 523 524

	if (current->journal_info == trans)
		current->journal_info = NULL;
C
Chris Mason 已提交
525
	memset(trans, 0, sizeof(*trans));
C
Chris Mason 已提交
526
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
527

Y
Yan, Zheng 已提交
528 529 530
	if (throttle)
		btrfs_run_delayed_iputs(root);

C
Chris Mason 已提交
531 532 533
	return 0;
}

534 535 536
/* end a transaction handle: throttle off, lock on */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 1);
}

/* end a transaction handle: throttle on, lock on */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

/* end a transaction handle: throttle off, lock off */
int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 0);
}

/*
 * end a transaction handle: throttle on, lock on (the same arguments
 * btrfs_end_transaction_throttle() passes)
 */
int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	int ret;

	ret = __btrfs_end_transaction(trans, root, 1, 1);
	return ret;
}

C
Chris Mason 已提交
573 574 575
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
576
 * those extents are sent to disk but does not wait on them
C
Chris Mason 已提交
577
 */
578
int btrfs_write_marked_extents(struct btrfs_root *root,
579
			       struct extent_io_tree *dirty_pages, int mark)
C
Chris Mason 已提交
580
{
581
	int err = 0;
582
	int werr = 0;
J
Josef Bacik 已提交
583
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
584
	u64 start = 0;
585
	u64 end;
586

J
Josef Bacik 已提交
587 588 589 590 591 592 593 594 595
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      mark)) {
		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
				   GFP_NOFS);
		err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
596
	}
597 598 599 600 601 602 603 604 605 606 607 608
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
609
			      struct extent_io_tree *dirty_pages, int mark)
610 611 612
{
	int err = 0;
	int werr = 0;
J
Josef Bacik 已提交
613
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
614 615
	u64 start = 0;
	u64 end;
616

J
Josef Bacik 已提交
617 618 619 620 621 622 623 624
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      EXTENT_NEED_WAIT)) {
		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
		err = filemap_fdatawait_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
625
	}
626 627 628
	if (err)
		werr = err;
	return werr;
C
Chris Mason 已提交
629 630
}

631 632 633 634 635 636
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.
 *
 * The wait step always runs, even if the write step failed.  The write
 * error takes precedence in the return value.
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int write_err;
	int wait_err;

	write_err = btrfs_write_marked_extents(root, dirty_pages, mark);
	wait_err = btrfs_wait_marked_extents(root, dirty_pages, mark);

	return write_err ? write_err : wait_err;
}

652 653 654 655 656 657 658 659 660
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
661 662
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
663 664
}

C
Chris Mason 已提交
665 666 667 668 669 670 671 672 673 674
/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
675 676
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
C
Chris Mason 已提交
677 678
{
	int ret;
679
	u64 old_root_bytenr;
680
	u64 old_root_used;
681
	struct btrfs_root *tree_root = root->fs_info->tree_root;
C
Chris Mason 已提交
682

683
	old_root_used = btrfs_root_used(&root->root_item);
684
	btrfs_write_dirty_block_groups(trans, root);
685

C
Chris Mason 已提交
686
	while (1) {
687
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
688 689
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
C
Chris Mason 已提交
690
			break;
691

692
		btrfs_set_root_node(&root->root_item, root->node);
C
Chris Mason 已提交
693
		ret = btrfs_update_root(trans, tree_root,
694 695
					&root->root_key,
					&root->root_item);
C
Chris Mason 已提交
696
		BUG_ON(ret);
697

698
		old_root_used = btrfs_root_used(&root->root_item);
699
		ret = btrfs_write_dirty_block_groups(trans, root);
700
		BUG_ON(ret);
701
	}
702 703 704 705

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

706 707 708
	return 0;
}

C
Chris Mason 已提交
709 710 711
/*
 * update all the cowonly tree roots on disk
 */
712 713
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
714 715 716
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
717
	struct extent_buffer *eb;
718
	int ret;
719

720 721
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);
722

723
	eb = btrfs_lock_root_node(fs_info->tree_root);
724
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
725 726
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);
727

728 729
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);
730

C
Chris Mason 已提交
731
	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
732 733 734
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);
735

736
		update_cowonly_root(trans, root);
C
Chris Mason 已提交
737
	}
738 739 740 741 742

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

C
Chris Mason 已提交
743 744 745
	return 0;
}

C
Chris Mason 已提交
746 747 748 749 750
/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
751
int btrfs_add_dead_root(struct btrfs_root *root)
752
{
J
Josef Bacik 已提交
753
	spin_lock(&root->fs_info->trans_lock);
754
	list_add(&root->root_list, &root->fs_info->dead_roots);
J
Josef Bacik 已提交
755
	spin_unlock(&root->fs_info->trans_lock);
756 757 758
	return 0;
}

C
Chris Mason 已提交
759
/*
760
 * update all the cowonly tree roots on disk
C
Chris Mason 已提交
761
 */
762 763
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
764 765
{
	struct btrfs_root *gang[8];
766
	struct btrfs_fs_info *fs_info = root->fs_info;
767 768
	int i;
	int ret;
769 770
	int err = 0;

J
Josef Bacik 已提交
771
	spin_lock(&fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
772
	while (1) {
773 774
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
775 776 777 778 779 780
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
781 782 783
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
784
			spin_unlock(&fs_info->fs_roots_radix_lock);
Y
Yan Zheng 已提交
785

786
			btrfs_free_log(trans, root);
787
			btrfs_update_reloc_root(trans, root);
788
			btrfs_orphan_commit_root(trans, root);
789

790 791
			btrfs_save_ino_cache(root, trans);

792 793 794 795
			/* see comments in should_cow_block() */
			root->force_cow = 0;
			smp_wmb();

796
			if (root->commit_root != root->node) {
797
				mutex_lock(&root->fs_commit_mutex);
J
Josef Bacik 已提交
798
				switch_commit_root(root);
799 800 801
				btrfs_unpin_free_ino(root);
				mutex_unlock(&root->fs_commit_mutex);

802 803 804
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}
805 806

			err = btrfs_update_root(trans, fs_info->tree_root,
807 808
						&root->root_key,
						&root->root_item);
J
Josef Bacik 已提交
809
			spin_lock(&fs_info->fs_roots_radix_lock);
810 811
			if (err)
				break;
812 813
		}
	}
J
Josef Bacik 已提交
814
	spin_unlock(&fs_info->fs_roots_radix_lock);
815
	return err;
816 817
}

C
Chris Mason 已提交
818 819 820 821
/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
822 823 824 825
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
826
	int ret;
827
	unsigned long nr;
828

829
	if (xchg(&root->defrag_running, 1))
830
		return 0;
831

832
	while (1) {
833 834 835 836
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

837
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
838

839
		nr = trans->blocks_used;
840
		btrfs_end_transaction(trans, root);
841
		btrfs_btree_balance_dirty(info->tree_root, nr);
842 843
		cond_resched();

844
		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
845 846 847
			break;
	}
	root->defrag_running = 0;
848
	return ret;
849 850
}

C
Chris Mason 已提交
851 852 853 854
/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
855
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
856 857 858 859
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
860
	struct btrfs_root_item *new_root_item;
861 862
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
863
	struct btrfs_root *parent_root;
L
Liu Bo 已提交
864
	struct btrfs_block_rsv *rsv;
865
	struct inode *parent_inode;
866
	struct dentry *parent;
867
	struct dentry *dentry;
868
	struct extent_buffer *tmp;
869
	struct extent_buffer *old;
870
	int ret;
871
	u64 to_reserve = 0;
872
	u64 index = 0;
873
	u64 objectid;
L
Li Zefan 已提交
874
	u64 root_flags;
875

L
Liu Bo 已提交
876 877
	rsv = trans->block_rsv;

878 879
	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
880
		pending->error = -ENOMEM;
881 882
		goto fail;
	}
883

884
	ret = btrfs_find_free_objectid(tree_root, &objectid);
885 886
	if (ret) {
		pending->error = ret;
887
		goto fail;
888
	}
889

890
	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
891 892

	if (to_reserve > 0) {
893 894
		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
						  to_reserve);
895 896 897 898 899 900
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

901
	key.objectid = objectid;
902 903
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;
904

905
	trans->block_rsv = &pending->block_rsv;
906

907
	dentry = pending->dentry;
908 909
	parent = dget_parent(dentry);
	parent_inode = parent->d_inode;
910
	parent_root = BTRFS_I(parent_inode)->root;
C
Chris Mason 已提交
911
	record_root_in_trans(trans, parent_root);
912

913 914 915
	/*
	 * insert the directory item
	 */
916
	ret = btrfs_set_inode_index(parent_inode, &index);
917
	BUG_ON(ret);
918
	ret = btrfs_insert_dir_item(trans, parent_root,
919
				dentry->d_name.name, dentry->d_name.len,
920
				parent_inode, &key,
921
				BTRFS_FT_DIR, index);
922
	BUG_ON(ret);
923

924 925
	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
926 927 928
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

929 930 931 932 933 934 935 936 937
	/*
	 * pull in the delayed directory update
	 * and the delayed inode item
	 * otherwise we corrupt the FS during
	 * snapshot
	 */
	ret = btrfs_run_delayed_items(trans, root);
	BUG_ON(ret);

C
Chris Mason 已提交
938
	record_root_in_trans(trans, root);
939 940
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
941
	btrfs_check_and_init_root_item(new_root_item);
942

L
Li Zefan 已提交
943 944 945 946 947 948 949
	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

950 951 952 953 954 955 956 957
	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old);
	btrfs_set_lock_blocking(old);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

958 959 960 961
	/* see comments in should_cow_block() */
	root->force_cow = 1;
	smp_wmb();

962
	btrfs_set_root_node(new_root_item, tmp);
963 964 965
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
966 967
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
968
	BUG_ON(ret);
969

970 971 972 973
	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
974
				 parent_root->root_key.objectid,
L
Li Zefan 已提交
975
				 btrfs_ino(parent_inode), index,
976
				 dentry->d_name.name, dentry->d_name.len);
977
	BUG_ON(ret);
978
	dput(parent);
979

980 981 982
	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
	BUG_ON(IS_ERR(pending->snap));
983

984
	btrfs_reloc_post_snapshot(trans, pending);
985
fail:
986
	kfree(new_root_item);
L
Liu Bo 已提交
987
	trans->block_rsv = rsv;
988 989
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
	return 0;
990 991
}

C
Chris Mason 已提交
992 993 994
/*
 * create all the snapshots we've scheduled for creation
 */
995 996
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
997 998 999 1000 1001
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

Q
Qinghuang Feng 已提交
1002
	list_for_each_entry(pending, head, list) {
1003 1004 1005 1006 1007 1008
		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}

1009 1010 1011 1012 1013
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

1014
	super = root->fs_info->super_copy;
1015 1016 1017 1018 1019 1020 1021 1022 1023 1024

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
1025
	if (btrfs_test_opt(root, SPACE_CACHE))
1026
		super->cache_generation = root_item->generation;
1027 1028
}

1029 1030 1031
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1032
	spin_lock(&info->trans_lock);
1033 1034
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
J
Josef Bacik 已提交
1035
	spin_unlock(&info->trans_lock);
1036 1037 1038
	return ret;
}

1039 1040 1041
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1042
	spin_lock(&info->trans_lock);
1043 1044
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
J
Josef Bacik 已提交
1045
	spin_unlock(&info->trans_lock);
1046 1047 1048
	return ret;
}

S
Sage Weil 已提交
1049 1050 1051 1052 1053 1054 1055
/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
					    struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1056
	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
S
Sage Weil 已提交
1057 1058 1059 1060 1061 1062 1063 1064 1065
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
					 struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1066 1067
	wait_event(root->fs_info->transaction_wait,
		   trans->commit_done || (trans->in_commit && !trans->blocked));
S
Sage Weil 已提交
1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096
}

/*
 * commit transactions asynchronously. once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 *
 * struct btrfs_async_commit carries the state that is handed to the
 * do_async_commit() work callback.
 */
struct btrfs_async_commit {
	/* handle the worker passes to btrfs_commit_transaction() */
	struct btrfs_trans_handle *newtrans;
	/* root passed to btrfs_commit_transaction() together with newtrans */
	struct btrfs_root *root;
	/* delayed work item whose callback is do_async_commit() */
	struct delayed_work work;
};

/*
 * Work callback for an asynchronous commit: recover the btrfs_async_commit
 * that embeds @work, commit its transaction handle against its root, then
 * free the container.
 */
static void do_async_commit(struct work_struct *work)
{
	struct btrfs_async_commit *async;

	async = container_of(work, struct btrfs_async_commit, work.work);
	btrfs_commit_transaction(async->newtrans, async->root);
	kfree(async);
}

int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   int wait_for_unblock)
{
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
T
Tsutomu Itoh 已提交
1097 1098
	if (!ac)
		return -ENOMEM;
S
Sage Weil 已提交
1099 1100 1101

	INIT_DELAYED_WORK(&ac->work, do_async_commit);
	ac->root = root;
1102
	ac->newtrans = btrfs_join_transaction(root);
1103 1104 1105 1106 1107
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}
S
Sage Weil 已提交
1108 1109 1110

	/* take transaction reference */
	cur_trans = trans->transaction;
1111
	atomic_inc(&cur_trans->use_count);
S
Sage Weil 已提交
1112 1113 1114 1115 1116 1117 1118 1119 1120 1121

	btrfs_end_transaction(trans, root);
	schedule_delayed_work(&ac->work, 0);

	/* wait for transaction to start and unblock */
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(root, cur_trans);
	else
		wait_current_trans_commit_start(root, cur_trans);

1122 1123 1124 1125
	if (current->journal_info == trans)
		current->journal_info = NULL;

	put_transaction(cur_trans);
S
Sage Weil 已提交
1126 1127 1128 1129 1130 1131 1132 1133 1134 1135
	return 0;
}

/*
 * btrfs_transaction state sequence:
 *    in_commit = 0, blocked = 0  (initial)
 *    in_commit = 1, blocked = 1
 *    blocked = 0
 *    commit_done = 1
 */
C
Chris Mason 已提交
1136 1137 1138
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
1139
	unsigned long joined = 0;
C
Chris Mason 已提交
1140
	struct btrfs_transaction *cur_trans;
C
Chris Mason 已提交
1141
	struct btrfs_transaction *prev_trans = NULL;
C
Chris Mason 已提交
1142
	DEFINE_WAIT(wait);
1143
	int ret;
1144 1145
	int should_grow = 0;
	unsigned long now = get_seconds();
1146
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
C
Chris Mason 已提交
1147

1148 1149
	btrfs_run_ordered_operations(root, 0);

1150
	btrfs_trans_release_metadata(trans, root);
1151 1152
	trans->block_rsv = NULL;

1153 1154 1155 1156 1157 1158
	/* make a pass through all the delayed refs we have so far
	 * any runnings procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

1159
	cur_trans = trans->transaction;
1160 1161 1162 1163
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
1164
	cur_trans->delayed_refs.flushing = 1;
1165

1166
	ret = btrfs_run_delayed_refs(trans, root, 0);
1167 1168
	BUG_ON(ret);

J
Josef Bacik 已提交
1169
	spin_lock(&cur_trans->commit_lock);
1170
	if (cur_trans->in_commit) {
J
Josef Bacik 已提交
1171
		spin_unlock(&cur_trans->commit_lock);
1172
		atomic_inc(&cur_trans->use_count);
C
Chris Mason 已提交
1173
		btrfs_end_transaction(trans, root);
C
Chris Mason 已提交
1174

1175
		wait_for_commit(root, cur_trans);
1176

C
Chris Mason 已提交
1177
		put_transaction(cur_trans);
1178

C
Chris Mason 已提交
1179 1180
		return 0;
	}
1181

C
Chris Mason 已提交
1182
	trans->transaction->in_commit = 1;
1183
	trans->transaction->blocked = 1;
J
Josef Bacik 已提交
1184
	spin_unlock(&cur_trans->commit_lock);
S
Sage Weil 已提交
1185 1186
	wake_up(&root->fs_info->transaction_blocked_wait);

J
Josef Bacik 已提交
1187
	spin_lock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1188 1189 1190 1191
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
1192
			atomic_inc(&prev_trans->use_count);
J
Josef Bacik 已提交
1193
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1194 1195 1196

			wait_for_commit(root, prev_trans);

1197
			put_transaction(prev_trans);
J
Josef Bacik 已提交
1198 1199
		} else {
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1200
		}
J
Josef Bacik 已提交
1201 1202
	} else {
		spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1203
	}
1204

1205 1206 1207
	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

1208
	do {
1209
		int snap_pending = 0;
J
Josef Bacik 已提交
1210

1211
		joined = cur_trans->num_joined;
1212 1213 1214
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

C
Chris Mason 已提交
1215
		WARN_ON(cur_trans != trans->transaction);
1216

1217
		if (flush_on_commit || snap_pending) {
Y
Yan, Zheng 已提交
1218 1219
			btrfs_start_delalloc_inodes(root, 1);
			ret = btrfs_wait_ordered_extents(root, 0, 1);
1220
			BUG_ON(ret);
1221 1222
		}

1223 1224 1225
		ret = btrfs_run_delayed_items(trans, root);
		BUG_ON(ret);

1226 1227 1228 1229 1230 1231 1232 1233 1234
		/*
		 * rename don't use btrfs_join_transaction, so, once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and no for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

1235 1236 1237
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

1238
		if (atomic_read(&cur_trans->num_writers) > 1)
1239 1240 1241
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);
1242 1243

		finish_wait(&cur_trans->writer_wait, &wait);
1244
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
1245
		 (should_grow && cur_trans->num_joined != joined));
1246

1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257
	/*
	 * Ok now we need to make sure to block out any other joins while we
	 * commit the transaction.  We could have started a join before setting
	 * no_join so make sure to wait for num_writers to == 1 again.
	 */
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->trans_no_join = 1;
	spin_unlock(&root->fs_info->trans_lock);
	wait_event(cur_trans->writer_wait,
		   atomic_read(&cur_trans->num_writers) == 1);

C
Chris Mason 已提交
1258 1259 1260 1261 1262 1263 1264
	/*
	 * the reloc mutex makes sure that we stop
	 * the balancing code from coming in and moving
	 * extents around in the middle of the commit
	 */
	mutex_lock(&root->fs_info->reloc_mutex);

1265
	ret = btrfs_run_delayed_items(trans, root);
1266 1267
	BUG_ON(ret);

1268
	ret = create_pending_snapshots(trans, root->fs_info);
1269 1270
	BUG_ON(ret);

1271 1272 1273
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

1274 1275 1276 1277 1278 1279
	/*
	 * make sure none of the code above managed to slip in a
	 * delayed item
	 */
	btrfs_assert_delayed_root_empty(root);

C
Chris Mason 已提交
1280
	WARN_ON(cur_trans != trans->transaction);
C
Chris Mason 已提交
1281

A
Arne Jansen 已提交
1282
	btrfs_scrub_pause(root);
1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297
	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

1298
	ret = commit_fs_roots(trans, root);
1299 1300
	BUG_ON(ret);

1301
	/* commit_fs_roots gets rid of all the tree log roots, it is now
1302 1303 1304 1305
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

1306
	ret = commit_cowonly_roots(trans, root);
C
Chris Mason 已提交
1307
	BUG_ON(ret);
1308

1309 1310
	btrfs_prepare_extent_commit(trans, root);

C
Chris Mason 已提交
1311
	cur_trans = root->fs_info->running_transaction;
1312 1313 1314

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
J
Josef Bacik 已提交
1315
	switch_commit_root(root->fs_info->tree_root);
1316 1317 1318

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
J
Josef Bacik 已提交
1319
	switch_commit_root(root->fs_info->chunk_root);
1320 1321

	update_super_roots(root);
1322 1323

	if (!root->fs_info->log_root_recovering) {
1324 1325
		btrfs_set_super_log_root(root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1326 1327
	}

1328 1329
	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
	       sizeof(*root->fs_info->super_copy));
C
Chris Mason 已提交
1330

1331
	trans->transaction->blocked = 0;
J
Josef Bacik 已提交
1332 1333 1334 1335
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->running_transaction = NULL;
	root->fs_info->trans_no_join = 0;
	spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1336
	mutex_unlock(&root->fs_info->reloc_mutex);
1337

1338
	wake_up(&root->fs_info->transaction_wait);
1339

C
Chris Mason 已提交
1340 1341
	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
Y
Yan Zheng 已提交
1342
	write_ctree_super(trans, root, 0);
1343

1344 1345 1346 1347 1348 1349
	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

1350
	btrfs_finish_extent_commit(trans, root);
1351

C
Chris Mason 已提交
1352
	cur_trans->commit_done = 1;
1353

1354
	root->fs_info->last_trans_committed = cur_trans->transid;
J
Josef Bacik 已提交
1355

C
Chris Mason 已提交
1356
	wake_up(&cur_trans->commit_wait);
1357

J
Josef Bacik 已提交
1358
	spin_lock(&root->fs_info->trans_lock);
1359
	list_del_init(&cur_trans->list);
J
Josef Bacik 已提交
1360 1361
	spin_unlock(&root->fs_info->trans_lock);

C
Chris Mason 已提交
1362
	put_transaction(cur_trans);
C
Chris Mason 已提交
1363
	put_transaction(cur_trans);
1364

1365 1366
	trace_btrfs_transaction_commit(root);

A
Arne Jansen 已提交
1367 1368
	btrfs_scrub_continue(root);

J
Josef Bacik 已提交
1369 1370 1371
	if (current->journal_info == trans)
		current->journal_info = NULL;

C
Chris Mason 已提交
1372
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
Y
Yan, Zheng 已提交
1373 1374 1375 1376

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

C
Chris Mason 已提交
1377 1378 1379
	return ret;
}

C
Chris Mason 已提交
1380 1381 1382
/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
1383 1384
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
1385 1386 1387
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

J
Josef Bacik 已提交
1388
	spin_lock(&fs_info->trans_lock);
1389
	list_splice_init(&fs_info->dead_roots, &list);
J
Josef Bacik 已提交
1390
	spin_unlock(&fs_info->trans_lock);
1391

1392 1393
	while (!list_empty(&list)) {
		root = list_entry(list.next, struct btrfs_root, root_list);
1394 1395
		list_del(&root->root_list);

1396 1397
		btrfs_kill_all_delayed_nodes(root);

1398 1399
		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
A
Arne Jansen 已提交
1400
			btrfs_drop_snapshot(root, NULL, 0, 0);
1401
		else
A
Arne Jansen 已提交
1402
			btrfs_drop_snapshot(root, NULL, 1, 0);
1403 1404 1405
	}
	return 0;
}