transaction.c 39.4 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

C
Chris Mason 已提交
19
#include <linux/fs.h>
20
#include <linux/slab.h>
C
Chris Mason 已提交
21
#include <linux/sched.h>
22
#include <linux/writeback.h>
23
#include <linux/pagemap.h>
24
#include <linux/blkdev.h>
C
Chris Mason 已提交
25 26 27
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
28
#include "locking.h"
29
#include "tree-log.h"
30
#include "inode-map.h"
C
Chris Mason 已提交
31

32 33
/*
 * radix tree tag set on fs_info->fs_roots_radix entries by
 * record_root_in_trans() and cleared again by commit_fs_roots()
 */
#define BTRFS_ROOT_TRANS_TAG 0

34
void put_transaction(struct btrfs_transaction *transaction)
C
Chris Mason 已提交
35
{
36 37
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
J
Josef Bacik 已提交
38
		BUG_ON(!list_empty(&transaction->list));
39 40
		WARN_ON(transaction->delayed_refs.root.rb_node);
		WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
C
Chris Mason 已提交
41 42
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
C
Chris Mason 已提交
43
	}
C
Chris Mason 已提交
44 45
}

J
Josef Bacik 已提交
46 47 48 49 50 51
/*
 * Release the extent buffer currently held in root->commit_root and replace
 * it with whatever btrfs_root_node() returns for @root.
 */
static noinline void switch_commit_root(struct btrfs_root *root)
{
	struct extent_buffer *stale = root->commit_root;

	free_extent_buffer(stale);
	root->commit_root = btrfs_root_node(root);
}

C
Chris Mason 已提交
52 53 54
/*
 * either allocate a new transaction or hop into the existing one
 */
J
Josef Bacik 已提交
55
static noinline int join_transaction(struct btrfs_root *root, int nofail)
C
Chris Mason 已提交
56 57
{
	struct btrfs_transaction *cur_trans;
J
Josef Bacik 已提交
58 59

	spin_lock(&root->fs_info->trans_lock);
60
loop:
61 62 63 64 65 66
	/* The file system has been taken offline. No new transactions. */
	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		spin_unlock(&root->fs_info->trans_lock);
		return -EROFS;
	}

J
Josef Bacik 已提交
67 68 69 70 71 72 73
	if (root->fs_info->trans_no_join) {
		if (!nofail) {
			spin_unlock(&root->fs_info->trans_lock);
			return -EBUSY;
		}
	}

C
Chris Mason 已提交
74
	cur_trans = root->fs_info->running_transaction;
J
Josef Bacik 已提交
75
	if (cur_trans) {
76 77
		if (cur_trans->aborted)
			return cur_trans->aborted;
J
Josef Bacik 已提交
78
		atomic_inc(&cur_trans->use_count);
79
		atomic_inc(&cur_trans->num_writers);
80
		cur_trans->num_joined++;
J
Josef Bacik 已提交
81 82
		spin_unlock(&root->fs_info->trans_lock);
		return 0;
C
Chris Mason 已提交
83
	}
J
Josef Bacik 已提交
84 85 86 87 88
	spin_unlock(&root->fs_info->trans_lock);

	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
89

J
Josef Bacik 已提交
90 91
	spin_lock(&root->fs_info->trans_lock);
	if (root->fs_info->running_transaction) {
92 93 94 95
		/*
		 * someone started a transaction after we unlocked.  Make sure
		 * to redo the trans_no_join checks above
		 */
J
Josef Bacik 已提交
96 97
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		cur_trans = root->fs_info->running_transaction;
98
		goto loop;
C
Chris Mason 已提交
99
	}
100

J
Josef Bacik 已提交
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
	atomic_set(&cur_trans->num_writers, 1);
	cur_trans->num_joined = 0;
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->in_commit = 0;
	cur_trans->blocked = 0;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
	cur_trans->commit_done = 0;
	cur_trans->start_time = get_seconds();

	cur_trans->delayed_refs.root = RB_ROOT;
	cur_trans->delayed_refs.num_entries = 0;
	cur_trans->delayed_refs.num_heads_ready = 0;
	cur_trans->delayed_refs.num_heads = 0;
	cur_trans->delayed_refs.flushing = 0;
	cur_trans->delayed_refs.run_delayed_start = 0;
121
	cur_trans->delayed_refs.seq = 1;
122
	init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
J
Josef Bacik 已提交
123 124
	spin_lock_init(&cur_trans->commit_lock);
	spin_lock_init(&cur_trans->delayed_refs.lock);
125
	INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
J
Josef Bacik 已提交
126 127 128 129

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
	list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
	extent_io_tree_init(&cur_trans->dirty_pages,
C
Chris Mason 已提交
130
			     root->fs_info->btree_inode->i_mapping);
J
Josef Bacik 已提交
131 132 133
	root->fs_info->generation++;
	cur_trans->transid = root->fs_info->generation;
	root->fs_info->running_transaction = cur_trans;
134
	cur_trans->aborted = 0;
J
Josef Bacik 已提交
135
	spin_unlock(&root->fs_info->trans_lock);
136

C
Chris Mason 已提交
137 138 139
	return 0;
}

C
Chris Mason 已提交
140
/*
C
Chris Mason 已提交
141 142 143 144
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
C
Chris Mason 已提交
145
 */
C
Chris Mason 已提交
146
static int record_root_in_trans(struct btrfs_trans_handle *trans,
J
Josef Bacik 已提交
147
			       struct btrfs_root *root)
148
{
149
	if (root->ref_cows && root->last_trans < trans->transid) {
150
		WARN_ON(root == root->fs_info->extent_root);
151 152
		WARN_ON(root->commit_root != root->node);

C
Chris Mason 已提交
153 154 155 156 157 158 159 160 161 162 163 164
		/*
		 * see below for in_trans_setup usage rules
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
		root->in_trans_setup = 1;

		/* make sure readers find in_trans_setup before
		 * they find our root->last_trans update
		 */
		smp_wmb();

J
Josef Bacik 已提交
165 166 167 168 169
		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
170 171 172
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
173
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
		 * with root->in_trans_setup.  When this is 1, we're still
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
195
		btrfs_init_reloc_root(trans, root);
C
Chris Mason 已提交
196 197
		smp_wmb();
		root->in_trans_setup = 0;
198 199 200
	}
	return 0;
}
201

C
Chris Mason 已提交
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224

/*
 * Record @root in the transaction behind @trans, taking
 * fs_info->reloc_mutex only when the lockless check cannot prove that the
 * root is already fully set up for this transaction.  Always returns 0.
 */
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (!root->ref_cows)
		return 0;

	/*
	 * see record_root_in_trans for comments about in_trans_setup usage
	 * and barriers
	 */
	smp_rmb();
	if (root->last_trans != trans->transid || root->in_trans_setup) {
		mutex_lock(&root->fs_info->reloc_mutex);
		record_root_in_trans(trans, root);
		mutex_unlock(&root->fs_info->reloc_mutex);
	}

	return 0;
}

C
Chris Mason 已提交
225 226 227 228
/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
C
Chris Mason 已提交
229
static void wait_current_trans(struct btrfs_root *root)
C
Chris Mason 已提交
230
{
231
	struct btrfs_transaction *cur_trans;
C
Chris Mason 已提交
232

J
Josef Bacik 已提交
233
	spin_lock(&root->fs_info->trans_lock);
234
	cur_trans = root->fs_info->running_transaction;
C
Chris Mason 已提交
235
	if (cur_trans && cur_trans->blocked) {
236
		atomic_inc(&cur_trans->use_count);
J
Josef Bacik 已提交
237
		spin_unlock(&root->fs_info->trans_lock);
L
Li Zefan 已提交
238 239 240

		wait_event(root->fs_info->transaction_wait,
			   !cur_trans->blocked);
241
		put_transaction(cur_trans);
J
Josef Bacik 已提交
242 243
	} else {
		spin_unlock(&root->fs_info->trans_lock);
244
	}
C
Chris Mason 已提交
245 246
}

247 248 249 250
/* how start_transaction() should behave for the different entry points */
enum btrfs_trans_type {
	/* may wait on a blocked transaction unless an ioctl trans is open */
	TRANS_START,
	/* never asks may_wait_transaction() to wait */
	TRANS_JOIN,
	/* may always wait; handle is not stored in current->journal_info */
	TRANS_USERSPACE,
	/* like TRANS_JOIN, but joins even while trans_no_join is set */
	TRANS_JOIN_NOLOCK,
};

254 255
/*
 * Decide whether a start_transaction() caller of the given @type is allowed
 * to wait for a blocked transaction.  Returns 1 if it may wait, 0 otherwise.
 */
static int may_wait_transaction(struct btrfs_root *root, int type)
{
	/* never wait while fs_info->log_root_recovering is set */
	if (root->fs_info->log_root_recovering)
		return 0;

	switch (type) {
	case TRANS_USERSPACE:
		return 1;
	case TRANS_START:
		/* only when no ioctl transaction is open */
		return atomic_read(&root->fs_info->open_ioctl_trans) ? 0 : 1;
	default:
		return 0;
	}
}

269
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
270
						    u64 num_items, int type)
C
Chris Mason 已提交
271
{
272 273
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
274
	u64 num_bytes = 0;
C
Chris Mason 已提交
275
	int ret;
L
liubo 已提交
276 277 278

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return ERR_PTR(-EROFS);
279 280 281 282 283 284 285 286 287

	if (current->journal_info) {
		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
		h = current->journal_info;
		h->use_count++;
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
288 289 290 291 292 293 294

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items > 0 && root != root->fs_info->chunk_root) {
		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
295
		ret = btrfs_block_rsv_add(root,
296 297 298 299 300
					  &root->fs_info->trans_block_rsv,
					  num_bytes);
		if (ret)
			return ERR_PTR(ret);
	}
301 302 303 304
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);
C
Chris Mason 已提交
305

306
	if (may_wait_transaction(root, type))
C
Chris Mason 已提交
307
		wait_current_trans(root);
308

J
Josef Bacik 已提交
309 310 311 312 313 314
	do {
		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
		if (ret == -EBUSY)
			wait_current_trans(root);
	} while (ret == -EBUSY);

T
Tsutomu Itoh 已提交
315
	if (ret < 0) {
316
		kmem_cache_free(btrfs_trans_handle_cachep, h);
T
Tsutomu Itoh 已提交
317 318
		return ERR_PTR(ret);
	}
319

320 321 322 323
	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
C
Chris Mason 已提交
324
	h->blocks_used = 0;
325
	h->bytes_reserved = 0;
326
	h->delayed_ref_updates = 0;
327
	h->use_count = 1;
328
	h->block_rsv = NULL;
329
	h->orig_rsv = NULL;
330
	h->aborted = 0;
331

332 333 334 335 336 337
	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

338
	if (num_bytes) {
J
Josef Bacik 已提交
339
		trace_btrfs_space_reservation(root->fs_info, "transaction",
340 341
					      (u64)(unsigned long)h,
					      num_bytes, 1);
342 343
		h->block_rsv = &root->fs_info->trans_block_rsv;
		h->bytes_reserved = num_bytes;
344
	}
J
Josef Bacik 已提交
345

346
got_it:
J
Josef Bacik 已提交
347
	btrfs_record_root_in_trans(h, root);
348 349 350

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
C
Chris Mason 已提交
351 352 353
	return h;
}

354
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
355
						   int num_items)
356
{
357
	return start_transaction(root, num_items, TRANS_START);
358
}
359
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
360
{
361
	return start_transaction(root, 0, TRANS_JOIN);
362 363
}

364
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
365 366 367 368
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

369
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
370
{
371
	return start_transaction(root, 0, TRANS_USERSPACE);
372 373
}

C
Chris Mason 已提交
374
/* wait for a transaction commit to be fully complete */
375
static noinline void wait_for_commit(struct btrfs_root *root,
376 377
				    struct btrfs_transaction *commit)
{
L
Li Zefan 已提交
378
	wait_event(commit->commit_wait, commit->commit_done);
379 380
}

381 382 383 384 385 386 387 388
/*
 * Wait for a transaction to finish committing.
 *
 * With a non-zero @transid, wait for exactly that transaction: returns 0
 * right away if it is not newer than fs_info->last_trans_committed and
 * -EINVAL if it is not on fs_info->trans_list.  With @transid == 0, wait for
 * the newest transaction that is in commit but not yet done, if any.
 *
 * Returns 0 or -EINVAL.
 */
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_transaction *target = NULL;
	struct btrfs_transaction *pos;

	if (transid) {
		if (transid <= fs_info->last_trans_committed)
			return 0;

		/* find specified transaction */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry(pos, &fs_info->trans_list, list) {
			if (pos->transid == transid) {
				target = pos;
				atomic_inc(&target->use_count);
				break;
			}
			if (pos->transid > transid)
				break;
		}
		spin_unlock(&fs_info->trans_lock);

		if (!target)
			return -EINVAL;  /* bad transid */
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry_reverse(pos, &fs_info->trans_list, list) {
			if (!pos->in_commit)
				continue;
			if (!pos->commit_done) {
				target = pos;
				atomic_inc(&target->use_count);
			}
			break;
		}
		spin_unlock(&fs_info->trans_lock);

		if (!target)
			return 0;  /* nothing committing|committed */
	}

	wait_for_commit(root, target);
	put_transaction(target);

	return 0;
}

C
Chris Mason 已提交
432 433
void btrfs_throttle(struct btrfs_root *root)
{
J
Josef Bacik 已提交
434
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
435
		wait_current_trans(root);
C
Chris Mason 已提交
436 437
}

438 439 440 441
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;
442 443

	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
444 445 446 447 448 449 450
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
451
	struct btrfs_block_rsv *rsv = trans->block_rsv;
452
	int updates;
453
	int err;
454

J
Josef Bacik 已提交
455
	smp_mb();
456 457 458
	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

459 460 461 462 463 464
	/*
	 * We need to do this in case we're deleting csums so the global block
	 * rsv get's used instead of the csum block rsv.
	 */
	trans->block_rsv = NULL;

465 466
	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
467 468 469 470 471
	if (updates) {
		err = btrfs_run_delayed_refs(trans, root, updates);
		if (err) /* Error code will also eval true */
			return err;
	}
472

473 474
	trans->block_rsv = rsv;

475 476 477
	return should_end_transaction(trans, root);
}

478
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
479
			  struct btrfs_root *root, int throttle, int lock)
C
Chris Mason 已提交
480
{
481
	struct btrfs_transaction *cur_trans = trans->transaction;
482
	struct btrfs_fs_info *info = root->fs_info;
483 484
	int count = 0;

485 486 487 488 489
	if (--trans->use_count) {
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

490
	btrfs_trans_release_metadata(trans, root);
491
	trans->block_rsv = NULL;
492
	while (count < 2) {
493 494 495 496 497 498 499 500 501 502
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
503 504
	}

J
Josef Bacik 已提交
505 506
	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
	    should_end_transaction(trans, root)) {
507
		trans->transaction->blocked = 1;
J
Josef Bacik 已提交
508 509
		smp_wmb();
	}
510

511
	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
512 513 514 515 516 517 518
		if (throttle) {
			/*
			 * We may race with somebody else here so end up having
			 * to call end_transaction on ourselves again, so inc
			 * our use_count.
			 */
			trans->use_count++;
519
			return btrfs_commit_transaction(trans, root);
520
		} else {
521
			wake_up_process(info->transaction_kthread);
522
		}
523 524 525
	}

	WARN_ON(cur_trans != info->running_transaction);
526 527
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
528

529
	smp_mb();
C
Chris Mason 已提交
530 531 532
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
J
Josef Bacik 已提交
533 534 535

	if (current->journal_info == trans)
		current->journal_info = NULL;
C
Chris Mason 已提交
536
	memset(trans, 0, sizeof(*trans));
C
Chris Mason 已提交
537
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
538

Y
Yan, Zheng 已提交
539 540 541
	if (throttle)
		btrfs_run_delayed_iputs(root);

542 543 544 545 546
	if (trans->aborted ||
	    root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		return -EIO;
	}

C
Chris Mason 已提交
547 548 549
	return 0;
}

550 551 552
/*
 * End @trans without throttling (throttle = 0, lock = 1).  Returns whatever
 * __btrfs_end_transaction() returns.
 */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 1);
}

/*
 * End @trans with throttling (throttle = 1, lock = 1).  Returns whatever
 * __btrfs_end_transaction() returns.
 */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

/*
 * End @trans with throttle = 0 and lock = 0, so this call neither marks the
 * transaction blocked nor reacts to it being blocked.  Returns whatever
 * __btrfs_end_transaction() returns.
 */
int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 0);
}

/*
 * End @trans with throttle = 1 and lock = 1; passes the same flags as
 * btrfs_end_transaction_throttle().
 */
int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	const int throttle = 1;
	const int lock = 1;

	return __btrfs_end_transaction(trans, root, throttle, lock);
}

C
Chris Mason 已提交
589 590 591
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
592
 * those extents are sent to disk but does not wait on them
C
Chris Mason 已提交
593
 */
594
int btrfs_write_marked_extents(struct btrfs_root *root,
595
			       struct extent_io_tree *dirty_pages, int mark)
C
Chris Mason 已提交
596
{
597
	int err = 0;
598
	int werr = 0;
J
Josef Bacik 已提交
599
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
600
	u64 start = 0;
601
	u64 end;
602

J
Josef Bacik 已提交
603 604 605 606 607 608 609 610 611
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      mark)) {
		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
				   GFP_NOFS);
		err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
612
	}
613 614 615 616 617 618 619 620 621 622 623 624
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
625
			      struct extent_io_tree *dirty_pages, int mark)
626 627 628
{
	int err = 0;
	int werr = 0;
J
Josef Bacik 已提交
629
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
630 631
	u64 start = 0;
	u64 end;
632

J
Josef Bacik 已提交
633 634 635 636 637 638 639 640
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      EXTENT_NEED_WAIT)) {
		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
		err = filemap_fdatawait_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
641
	}
642 643 644
	if (err)
		werr = err;
	return werr;
C
Chris Mason 已提交
645 646
}

647 648 649 650 651 652
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 *
 * The wait pass always runs, even if the write pass failed.  Returns the
 * write error if there was one, otherwise the wait result.
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int write_err = btrfs_write_marked_extents(root, dirty_pages, mark);
	int wait_err = btrfs_wait_marked_extents(root, dirty_pages, mark);

	return write_err ? write_err : wait_err;
}

668 669 670 671 672 673 674 675 676
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
677 678
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
679 680
}

C
Chris Mason 已提交
681 682 683 684 685 686 687 688 689 690
/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
691 692
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
C
Chris Mason 已提交
693 694
{
	int ret;
695
	u64 old_root_bytenr;
696
	u64 old_root_used;
697
	struct btrfs_root *tree_root = root->fs_info->tree_root;
C
Chris Mason 已提交
698

699
	old_root_used = btrfs_root_used(&root->root_item);
700
	btrfs_write_dirty_block_groups(trans, root);
701

C
Chris Mason 已提交
702
	while (1) {
703
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
704 705
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
C
Chris Mason 已提交
706
			break;
707

708
		btrfs_set_root_node(&root->root_item, root->node);
C
Chris Mason 已提交
709
		ret = btrfs_update_root(trans, tree_root,
710 711
					&root->root_key,
					&root->root_item);
712 713
		if (ret)
			return ret;
714

715
		old_root_used = btrfs_root_used(&root->root_item);
716
		ret = btrfs_write_dirty_block_groups(trans, root);
717 718
		if (ret)
			return ret;
719
	}
720 721 722 723

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

724 725 726
	return 0;
}

C
Chris Mason 已提交
727 728
/*
 * update all the cowonly tree roots on disk
729 730 731 732
 *
 * The error handling in this function may not be obvious. Any of the
 * failures will cause the file system to go offline. We still need
 * to clean up the delayed refs.
C
Chris Mason 已提交
733
 */
734 735
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
736 737 738
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
739
	struct extent_buffer *eb;
740
	int ret;
741

742
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
743 744
	if (ret)
		return ret;
745

746
	eb = btrfs_lock_root_node(fs_info->tree_root);
747 748
	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
			      0, &eb);
749 750
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);
751

752 753 754
	if (ret)
		return ret;

755
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
756 757
	if (ret)
		return ret;
758

C
Chris Mason 已提交
759
	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
760 761 762
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);
763

764 765 766
		ret = update_cowonly_root(trans, root);
		if (ret)
			return ret;
C
Chris Mason 已提交
767
	}
768 769 770 771 772

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

C
Chris Mason 已提交
773 774 775
	return 0;
}

C
Chris Mason 已提交
776 777 778 779 780
/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
781
int btrfs_add_dead_root(struct btrfs_root *root)
782
{
J
Josef Bacik 已提交
783
	spin_lock(&root->fs_info->trans_lock);
784
	list_add(&root->root_list, &root->fs_info->dead_roots);
J
Josef Bacik 已提交
785
	spin_unlock(&root->fs_info->trans_lock);
786 787 788
	return 0;
}

C
Chris Mason 已提交
789
/*
790
 * update all the cowonly tree roots on disk
C
Chris Mason 已提交
791
 */
792 793
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
794 795
{
	struct btrfs_root *gang[8];
796
	struct btrfs_fs_info *fs_info = root->fs_info;
797 798
	int i;
	int ret;
799 800
	int err = 0;

J
Josef Bacik 已提交
801
	spin_lock(&fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
802
	while (1) {
803 804
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
805 806 807 808 809 810
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
811 812 813
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
814
			spin_unlock(&fs_info->fs_roots_radix_lock);
Y
Yan Zheng 已提交
815

816
			btrfs_free_log(trans, root);
817
			btrfs_update_reloc_root(trans, root);
818
			btrfs_orphan_commit_root(trans, root);
819

820 821
			btrfs_save_ino_cache(root, trans);

822 823 824 825
			/* see comments in should_cow_block() */
			root->force_cow = 0;
			smp_wmb();

826
			if (root->commit_root != root->node) {
827
				mutex_lock(&root->fs_commit_mutex);
J
Josef Bacik 已提交
828
				switch_commit_root(root);
829 830 831
				btrfs_unpin_free_ino(root);
				mutex_unlock(&root->fs_commit_mutex);

832 833 834
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}
835 836

			err = btrfs_update_root(trans, fs_info->tree_root,
837 838
						&root->root_key,
						&root->root_item);
J
Josef Bacik 已提交
839
			spin_lock(&fs_info->fs_roots_radix_lock);
840 841
			if (err)
				break;
842 843
		}
	}
J
Josef Bacik 已提交
844
	spin_unlock(&fs_info->fs_roots_radix_lock);
845
	return err;
846 847
}

C
Chris Mason 已提交
848 849 850 851
/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
852 853 854 855
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
856
	int ret;
857
	unsigned long nr;
858

859
	if (xchg(&root->defrag_running, 1))
860
		return 0;
861

862
	while (1) {
863 864 865 866
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

867
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
868

869
		nr = trans->blocks_used;
870
		btrfs_end_transaction(trans, root);
871
		btrfs_btree_balance_dirty(info->tree_root, nr);
872 873
		cond_resched();

874
		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
875 876 877
			break;
	}
	root->defrag_running = 0;
878
	return ret;
879 880
}

C
Chris Mason 已提交
881 882 883 884
/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
885
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
886 887 888 889
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
890
	struct btrfs_root_item *new_root_item;
891 892
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
893
	struct btrfs_root *parent_root;
L
Liu Bo 已提交
894
	struct btrfs_block_rsv *rsv;
895
	struct inode *parent_inode;
896
	struct dentry *parent;
897
	struct dentry *dentry;
898
	struct extent_buffer *tmp;
899
	struct extent_buffer *old;
900
	int ret;
901
	u64 to_reserve = 0;
902
	u64 index = 0;
903
	u64 objectid;
L
Li Zefan 已提交
904
	u64 root_flags;
905

L
Liu Bo 已提交
906 907
	rsv = trans->block_rsv;

908 909
	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
910
		ret = pending->error = -ENOMEM;
911 912
		goto fail;
	}
913

914
	ret = btrfs_find_free_objectid(tree_root, &objectid);
915 916
	if (ret) {
		pending->error = ret;
917
		goto fail;
918
	}
919

920
	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
921 922

	if (to_reserve > 0) {
923 924
		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
						  to_reserve);
925 926 927 928 929 930
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

931
	key.objectid = objectid;
932 933
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;
934

935
	trans->block_rsv = &pending->block_rsv;
936

937
	dentry = pending->dentry;
938 939
	parent = dget_parent(dentry);
	parent_inode = parent->d_inode;
940
	parent_root = BTRFS_I(parent_inode)->root;
C
Chris Mason 已提交
941
	record_root_in_trans(trans, parent_root);
942

943 944 945
	/*
	 * insert the directory item
	 */
946
	ret = btrfs_set_inode_index(parent_inode, &index);
947
	BUG_ON(ret); /* -ENOMEM */
948
	ret = btrfs_insert_dir_item(trans, parent_root,
949
				dentry->d_name.name, dentry->d_name.len,
950
				parent_inode, &key,
951
				BTRFS_FT_DIR, index);
952
	if (ret == -EEXIST) {
953 954 955
		pending->error = -EEXIST;
		dput(parent);
		goto fail;
956 957 958
	} else if (ret) {
		goto abort_trans_dput;
	}
959

960 961
	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
962
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
963
	if (ret)
964
		goto abort_trans_dput;
965

966 967 968 969 970 971 972
	/*
	 * pull in the delayed directory update
	 * and the delayed inode item
	 * otherwise we corrupt the FS during
	 * snapshot
	 */
	ret = btrfs_run_delayed_items(trans, root);
973 974
	if (ret) { /* Transaction aborted */
		dput(parent);
975
		goto fail;
976
	}
977

C
Chris Mason 已提交
978
	record_root_in_trans(trans, root);
979 980
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
981
	btrfs_check_and_init_root_item(new_root_item);
982

L
Li Zefan 已提交
983 984 985 986 987 988 989
	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

990
	old = btrfs_lock_root_node(root);
991
	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
992 993 994 995 996
	if (ret) {
		btrfs_tree_unlock(old);
		free_extent_buffer(old);
		goto abort_trans_dput;
	}
997

998 999
	btrfs_set_lock_blocking(old);

1000
	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
1001
	/* clean up in any case */
1002 1003
	btrfs_tree_unlock(old);
	free_extent_buffer(old);
1004 1005
	if (ret)
		goto abort_trans_dput;
1006

1007 1008 1009 1010
	/* see comments in should_cow_block() */
	root->force_cow = 1;
	smp_wmb();

1011
	btrfs_set_root_node(new_root_item, tmp);
1012 1013 1014
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
1015 1016
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
1017
	if (ret)
1018
		goto abort_trans_dput;
1019

1020 1021 1022 1023
	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
1024
				 parent_root->root_key.objectid,
L
Li Zefan 已提交
1025
				 btrfs_ino(parent_inode), index,
1026
				 dentry->d_name.name, dentry->d_name.len);
1027
	dput(parent);
1028 1029
	if (ret)
		goto fail;
1030

1031 1032
	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
1033 1034
	if (IS_ERR(pending->snap)) {
		ret = PTR_ERR(pending->snap);
1035
		goto abort_trans;
1036
	}
1037

1038 1039 1040 1041
	ret = btrfs_reloc_post_snapshot(trans, pending);
	if (ret)
		goto abort_trans;
	ret = 0;
1042
fail:
1043
	kfree(new_root_item);
L
Liu Bo 已提交
1044
	trans->block_rsv = rsv;
1045
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1046 1047
	return ret;

1048 1049
abort_trans_dput:
	dput(parent);
1050 1051 1052
abort_trans:
	btrfs_abort_transaction(trans, root, ret);
	goto fail;
1053 1054
}

C
Chris Mason 已提交
1055 1056 1057
/*
 * create all the snapshots we've scheduled for creation
 */
1058 1059
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
1060 1061 1062 1063
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;

1064 1065
	list_for_each_entry(pending, head, list)
		create_pending_snapshot(trans, fs_info, pending);
1066 1067 1068
	return 0;
}

1069 1070 1071 1072 1073
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

1074
	super = root->fs_info->super_copy;
1075 1076 1077 1078 1079 1080 1081 1082 1083 1084

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
1085
	if (btrfs_test_opt(root, SPACE_CACHE))
1086
		super->cache_generation = root_item->generation;
1087 1088
}

1089 1090 1091
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1092
	spin_lock(&info->trans_lock);
1093 1094
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
J
Josef Bacik 已提交
1095
	spin_unlock(&info->trans_lock);
1096 1097 1098
	return ret;
}

1099 1100 1101
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1102
	spin_lock(&info->trans_lock);
1103 1104
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
J
Josef Bacik 已提交
1105
	spin_unlock(&info->trans_lock);
1106 1107 1108
	return ret;
}

S
Sage Weil 已提交
1109 1110 1111 1112 1113 1114 1115
/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
					    struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1116
	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
S
Sage Weil 已提交
1117 1118 1119 1120 1121 1122 1123 1124 1125
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
					 struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1126 1127
	wait_event(root->fs_info->transaction_wait,
		   trans->commit_done || (trans->in_commit && !trans->blocked));
S
Sage Weil 已提交
1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156
}

/*
 * Context for committing a transaction asynchronously.  One of these is
 * allocated by btrfs_commit_transaction_async() and freed by
 * do_async_commit() once the commit call has returned.
 *
 * Once btrfs_commit_transaction_async returns, any subsequent transaction
 * will not be allowed to join.
 */
struct btrfs_async_commit {
	/* handle from btrfs_join_transaction(); committed by the worker */
	struct btrfs_trans_handle *newtrans;
	/* root handed to btrfs_commit_transaction() by the worker */
	struct btrfs_root *root;
	/* delayed work item whose callback is do_async_commit() */
	struct delayed_work work;
};

/*
 * Work callback for an asynchronous commit: commit the handle stored in
 * the enclosing btrfs_async_commit, then free that context.  The result of
 * the commit is not reported anywhere.
 */
static void do_async_commit(struct work_struct *work)
{
	struct delayed_work *dwork =
		container_of(work, struct delayed_work, work);
	struct btrfs_async_commit *async =
		container_of(dwork, struct btrfs_async_commit, work);

	btrfs_commit_transaction(async->newtrans, async->root);

	kfree(async);
}

int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   int wait_for_unblock)
{
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
T
Tsutomu Itoh 已提交
1157 1158
	if (!ac)
		return -ENOMEM;
S
Sage Weil 已提交
1159 1160 1161

	INIT_DELAYED_WORK(&ac->work, do_async_commit);
	ac->root = root;
1162
	ac->newtrans = btrfs_join_transaction(root);
1163 1164 1165 1166 1167
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}
S
Sage Weil 已提交
1168 1169 1170

	/* take transaction reference */
	cur_trans = trans->transaction;
1171
	atomic_inc(&cur_trans->use_count);
S
Sage Weil 已提交
1172 1173 1174 1175 1176 1177 1178 1179 1180 1181

	btrfs_end_transaction(trans, root);
	schedule_delayed_work(&ac->work, 0);

	/* wait for transaction to start and unblock */
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(root, cur_trans);
	else
		wait_current_trans_commit_start(root, cur_trans);

1182 1183 1184 1185
	if (current->journal_info == trans)
		current->journal_info = NULL;

	put_transaction(cur_trans);
S
Sage Weil 已提交
1186 1187 1188
	return 0;
}

1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215

/*
 * Tear down the transaction behind @trans after its commit was abandoned.
 *
 * The transaction is unlinked from the transaction list, handed to
 * btrfs_cleanup_one_transaction(), and two references on it are dropped.
 * Scrub is then resumed, current->journal_info is cleared if it still
 * points at @trans, and the handle itself is freed.
 *
 * @trans: handle of the failed commit; freed here, must not be used after
 * @root:  root the commit was running against
 */
static void cleanup_transaction(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;

	WARN_ON(trans->use_count > 1);

	spin_lock(&root->fs_info->trans_lock);
	list_del_init(&cur_trans->list);
	/*
	 * The commit may have failed before it detached itself from fs_info.
	 * Mirror what the successful commit path does so that nobody is left
	 * with a pointer to a transaction whose references we drop below, and
	 * so that joiners are not blocked forever by a stale trans_no_join.
	 * If the commit already cleared running_transaction, it also already
	 * reset trans_no_join, so there is nothing to do.
	 */
	if (cur_trans == root->fs_info->running_transaction) {
		root->fs_info->running_transaction = NULL;
		root->fs_info->trans_no_join = 0;
	}
	spin_unlock(&root->fs_info->trans_lock);

	btrfs_cleanup_one_transaction(cur_trans, root);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	trace_btrfs_transaction_commit(root);

	btrfs_scrub_continue(root);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
}

S
Sage Weil 已提交
1216 1217 1218 1219 1220 1221 1222
/*
 * btrfs_transaction state sequence:
 *    in_commit = 0, blocked = 0  (initial)
 *    in_commit = 1, blocked = 1
 *    blocked = 0
 *    commit_done = 1
 */
C
Chris Mason 已提交
1223 1224 1225
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
1226
	unsigned long joined = 0;
1227
	struct btrfs_transaction *cur_trans = trans->transaction;
C
Chris Mason 已提交
1228
	struct btrfs_transaction *prev_trans = NULL;
C
Chris Mason 已提交
1229
	DEFINE_WAIT(wait);
1230
	int ret = -EIO;
1231 1232
	int should_grow = 0;
	unsigned long now = get_seconds();
1233
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
C
Chris Mason 已提交
1234

1235 1236
	btrfs_run_ordered_operations(root, 0);

1237
	btrfs_trans_release_metadata(trans, root);
1238 1239
	trans->block_rsv = NULL;

1240 1241 1242
	if (cur_trans->aborted)
		goto cleanup_transaction;

1243 1244 1245 1246
	/* make a pass through all the delayed refs we have so far
	 * any runnings procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
1247 1248
	if (ret)
		goto cleanup_transaction;
1249

1250
	cur_trans = trans->transaction;
1251

1252 1253 1254 1255
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
1256
	cur_trans->delayed_refs.flushing = 1;
1257

1258
	ret = btrfs_run_delayed_refs(trans, root, 0);
1259 1260
	if (ret)
		goto cleanup_transaction;
1261

J
Josef Bacik 已提交
1262
	spin_lock(&cur_trans->commit_lock);
1263
	if (cur_trans->in_commit) {
J
Josef Bacik 已提交
1264
		spin_unlock(&cur_trans->commit_lock);
1265
		atomic_inc(&cur_trans->use_count);
1266
		ret = btrfs_end_transaction(trans, root);
C
Chris Mason 已提交
1267

1268
		wait_for_commit(root, cur_trans);
1269

C
Chris Mason 已提交
1270
		put_transaction(cur_trans);
1271

1272
		return ret;
C
Chris Mason 已提交
1273
	}
1274

C
Chris Mason 已提交
1275
	trans->transaction->in_commit = 1;
1276
	trans->transaction->blocked = 1;
J
Josef Bacik 已提交
1277
	spin_unlock(&cur_trans->commit_lock);
S
Sage Weil 已提交
1278 1279
	wake_up(&root->fs_info->transaction_blocked_wait);

J
Josef Bacik 已提交
1280
	spin_lock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1281 1282 1283 1284
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
1285
			atomic_inc(&prev_trans->use_count);
J
Josef Bacik 已提交
1286
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1287 1288 1289

			wait_for_commit(root, prev_trans);

1290
			put_transaction(prev_trans);
J
Josef Bacik 已提交
1291 1292
		} else {
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1293
		}
J
Josef Bacik 已提交
1294 1295
	} else {
		spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1296
	}
1297

1298 1299 1300
	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

1301
	do {
1302
		int snap_pending = 0;
J
Josef Bacik 已提交
1303

1304
		joined = cur_trans->num_joined;
1305 1306 1307
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

C
Chris Mason 已提交
1308
		WARN_ON(cur_trans != trans->transaction);
1309

1310
		if (flush_on_commit || snap_pending) {
Y
Yan, Zheng 已提交
1311
			btrfs_start_delalloc_inodes(root, 1);
1312
			btrfs_wait_ordered_extents(root, 0, 1);
1313 1314
		}

1315
		ret = btrfs_run_delayed_items(trans, root);
1316 1317
		if (ret)
			goto cleanup_transaction;
1318

1319 1320 1321 1322 1323 1324 1325 1326 1327
		/*
		 * rename don't use btrfs_join_transaction, so, once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and no for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

1328 1329 1330
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

1331
		if (atomic_read(&cur_trans->num_writers) > 1)
1332 1333 1334
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);
1335 1336

		finish_wait(&cur_trans->writer_wait, &wait);
1337
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
1338
		 (should_grow && cur_trans->num_joined != joined));
1339

1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350
	/*
	 * Ok now we need to make sure to block out any other joins while we
	 * commit the transaction.  We could have started a join before setting
	 * no_join so make sure to wait for num_writers to == 1 again.
	 */
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->trans_no_join = 1;
	spin_unlock(&root->fs_info->trans_lock);
	wait_event(cur_trans->writer_wait,
		   atomic_read(&cur_trans->num_writers) == 1);

C
Chris Mason 已提交
1351 1352 1353 1354 1355 1356 1357
	/*
	 * the reloc mutex makes sure that we stop
	 * the balancing code from coming in and moving
	 * extents around in the middle of the commit
	 */
	mutex_lock(&root->fs_info->reloc_mutex);

1358
	ret = btrfs_run_delayed_items(trans, root);
1359 1360 1361 1362
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1363

1364
	ret = create_pending_snapshots(trans, root->fs_info);
1365 1366 1367 1368
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1369

1370
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1371 1372 1373 1374
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1375

1376 1377 1378 1379 1380 1381
	/*
	 * make sure none of the code above managed to slip in a
	 * delayed item
	 */
	btrfs_assert_delayed_root_empty(root);

C
Chris Mason 已提交
1382
	WARN_ON(cur_trans != trans->transaction);
C
Chris Mason 已提交
1383

A
Arne Jansen 已提交
1384
	btrfs_scrub_pause(root);
1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399
	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

1400
	ret = commit_fs_roots(trans, root);
1401 1402 1403 1404
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}
1405

1406
	/* commit_fs_roots gets rid of all the tree log roots, it is now
1407 1408 1409 1410
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

1411
	ret = commit_cowonly_roots(trans, root);
1412 1413 1414 1415
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}
1416

1417 1418
	btrfs_prepare_extent_commit(trans, root);

C
Chris Mason 已提交
1419
	cur_trans = root->fs_info->running_transaction;
1420 1421 1422

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
J
Josef Bacik 已提交
1423
	switch_commit_root(root->fs_info->tree_root);
1424 1425 1426

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
J
Josef Bacik 已提交
1427
	switch_commit_root(root->fs_info->chunk_root);
1428 1429

	update_super_roots(root);
1430 1431

	if (!root->fs_info->log_root_recovering) {
1432 1433
		btrfs_set_super_log_root(root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1434 1435
	}

1436 1437
	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
	       sizeof(*root->fs_info->super_copy));
C
Chris Mason 已提交
1438

1439
	trans->transaction->blocked = 0;
J
Josef Bacik 已提交
1440 1441 1442 1443
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->running_transaction = NULL;
	root->fs_info->trans_no_join = 0;
	spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1444
	mutex_unlock(&root->fs_info->reloc_mutex);
1445

1446
	wake_up(&root->fs_info->transaction_wait);
1447

C
Chris Mason 已提交
1448
	ret = btrfs_write_and_wait_transaction(trans, root);
1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460
	if (ret) {
		btrfs_error(root->fs_info, ret,
			    "Error while writing out transaction.");
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}

	ret = write_ctree_super(trans, root, 0);
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}
1461

1462 1463 1464 1465 1466 1467
	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

1468
	btrfs_finish_extent_commit(trans, root);
1469

C
Chris Mason 已提交
1470
	cur_trans->commit_done = 1;
1471

1472
	root->fs_info->last_trans_committed = cur_trans->transid;
J
Josef Bacik 已提交
1473

C
Chris Mason 已提交
1474
	wake_up(&cur_trans->commit_wait);
1475

J
Josef Bacik 已提交
1476
	spin_lock(&root->fs_info->trans_lock);
1477
	list_del_init(&cur_trans->list);
J
Josef Bacik 已提交
1478 1479
	spin_unlock(&root->fs_info->trans_lock);

C
Chris Mason 已提交
1480
	put_transaction(cur_trans);
C
Chris Mason 已提交
1481
	put_transaction(cur_trans);
1482

1483 1484
	trace_btrfs_transaction_commit(root);

A
Arne Jansen 已提交
1485 1486
	btrfs_scrub_continue(root);

J
Josef Bacik 已提交
1487 1488 1489
	if (current->journal_info == trans)
		current->journal_info = NULL;

C
Chris Mason 已提交
1490
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
Y
Yan, Zheng 已提交
1491 1492 1493 1494

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

C
Chris Mason 已提交
1495
	return ret;
1496 1497 1498 1499 1500 1501 1502 1503 1504

cleanup_transaction:
	btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
//	WARN_ON(1);
	if (current->journal_info == trans)
		current->journal_info = NULL;
	cleanup_transaction(trans, root);

	return ret;
C
Chris Mason 已提交
1505 1506
}

C
Chris Mason 已提交
1507 1508 1509
/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
1510 1511
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
1512 1513 1514
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

J
Josef Bacik 已提交
1515
	spin_lock(&fs_info->trans_lock);
1516
	list_splice_init(&fs_info->dead_roots, &list);
J
Josef Bacik 已提交
1517
	spin_unlock(&fs_info->trans_lock);
1518

1519
	while (!list_empty(&list)) {
1520 1521
		int ret;

1522
		root = list_entry(list.next, struct btrfs_root, root_list);
1523 1524
		list_del(&root->root_list);

1525 1526
		btrfs_kill_all_delayed_nodes(root);

1527 1528
		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
1529
			ret = btrfs_drop_snapshot(root, NULL, 0, 0);
1530
		else
1531 1532
			ret =btrfs_drop_snapshot(root, NULL, 1, 0);
		BUG_ON(ret < 0);
1533 1534 1535
	}
	return 0;
}