transaction.c 39.2 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

C
Chris Mason 已提交
19
#include <linux/fs.h>
20
#include <linux/slab.h>
C
Chris Mason 已提交
21
#include <linux/sched.h>
22
#include <linux/writeback.h>
23
#include <linux/pagemap.h>
24
#include <linux/blkdev.h>
C
Chris Mason 已提交
25 26 27
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
28
#include "locking.h"
29
#include "tree-log.h"
30
#include "inode-map.h"
C
Chris Mason 已提交
31

32 33
#define BTRFS_ROOT_TRANS_TAG 0

34
void put_transaction(struct btrfs_transaction *transaction)
C
Chris Mason 已提交
35
{
36 37
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
J
Josef Bacik 已提交
38
		BUG_ON(!list_empty(&transaction->list));
39 40
		WARN_ON(transaction->delayed_refs.root.rb_node);
		WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
C
Chris Mason 已提交
41 42
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
C
Chris Mason 已提交
43
	}
C
Chris Mason 已提交
44 45
}

J
Josef Bacik 已提交
46 47 48 49 50 51
/*
 * Release the extent buffer currently held as root->commit_root and replace
 * it with whatever btrfs_root_node() returns for @root.
 */
static noinline void switch_commit_root(struct btrfs_root *root)
{
	struct extent_buffer *stale = root->commit_root;

	free_extent_buffer(stale);
	root->commit_root = btrfs_root_node(root);
}

C
Chris Mason 已提交
52 53 54
/*
 * either allocate a new transaction or hop into the existing one
 */
J
Josef Bacik 已提交
55
static noinline int join_transaction(struct btrfs_root *root, int nofail)
C
Chris Mason 已提交
56 57
{
	struct btrfs_transaction *cur_trans;
J
Josef Bacik 已提交
58 59

	spin_lock(&root->fs_info->trans_lock);
60
loop:
61 62 63 64 65 66
	/* The file system has been taken offline. No new transactions. */
	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		spin_unlock(&root->fs_info->trans_lock);
		return -EROFS;
	}

J
Josef Bacik 已提交
67 68 69 70 71 72 73
	if (root->fs_info->trans_no_join) {
		if (!nofail) {
			spin_unlock(&root->fs_info->trans_lock);
			return -EBUSY;
		}
	}

C
Chris Mason 已提交
74
	cur_trans = root->fs_info->running_transaction;
J
Josef Bacik 已提交
75
	if (cur_trans) {
76 77
		if (cur_trans->aborted)
			return cur_trans->aborted;
J
Josef Bacik 已提交
78
		atomic_inc(&cur_trans->use_count);
79
		atomic_inc(&cur_trans->num_writers);
80
		cur_trans->num_joined++;
J
Josef Bacik 已提交
81 82
		spin_unlock(&root->fs_info->trans_lock);
		return 0;
C
Chris Mason 已提交
83
	}
J
Josef Bacik 已提交
84 85 86 87 88
	spin_unlock(&root->fs_info->trans_lock);

	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
89

J
Josef Bacik 已提交
90 91
	spin_lock(&root->fs_info->trans_lock);
	if (root->fs_info->running_transaction) {
92 93 94 95
		/*
		 * someone started a transaction after we unlocked.  Make sure
		 * to redo the trans_no_join checks above
		 */
J
Josef Bacik 已提交
96 97
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		cur_trans = root->fs_info->running_transaction;
98
		goto loop;
C
Chris Mason 已提交
99
	}
100

J
Josef Bacik 已提交
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
	atomic_set(&cur_trans->num_writers, 1);
	cur_trans->num_joined = 0;
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->in_commit = 0;
	cur_trans->blocked = 0;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
	cur_trans->commit_done = 0;
	cur_trans->start_time = get_seconds();

	cur_trans->delayed_refs.root = RB_ROOT;
	cur_trans->delayed_refs.num_entries = 0;
	cur_trans->delayed_refs.num_heads_ready = 0;
	cur_trans->delayed_refs.num_heads = 0;
	cur_trans->delayed_refs.flushing = 0;
	cur_trans->delayed_refs.run_delayed_start = 0;
121
	cur_trans->delayed_refs.seq = 1;
122
	init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
J
Josef Bacik 已提交
123 124
	spin_lock_init(&cur_trans->commit_lock);
	spin_lock_init(&cur_trans->delayed_refs.lock);
125
	INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
J
Josef Bacik 已提交
126 127 128 129

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
	list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
	extent_io_tree_init(&cur_trans->dirty_pages,
C
Chris Mason 已提交
130
			     root->fs_info->btree_inode->i_mapping);
J
Josef Bacik 已提交
131 132 133
	root->fs_info->generation++;
	cur_trans->transid = root->fs_info->generation;
	root->fs_info->running_transaction = cur_trans;
134
	cur_trans->aborted = 0;
J
Josef Bacik 已提交
135
	spin_unlock(&root->fs_info->trans_lock);
136

C
Chris Mason 已提交
137 138 139
	return 0;
}

C
Chris Mason 已提交
140
/*
C
Chris Mason 已提交
141 142 143 144
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
C
Chris Mason 已提交
145
 */
C
Chris Mason 已提交
146
static int record_root_in_trans(struct btrfs_trans_handle *trans,
J
Josef Bacik 已提交
147
			       struct btrfs_root *root)
148
{
149
	if (root->ref_cows && root->last_trans < trans->transid) {
150
		WARN_ON(root == root->fs_info->extent_root);
151 152
		WARN_ON(root->commit_root != root->node);

C
Chris Mason 已提交
153 154 155 156 157 158 159 160 161 162 163 164
		/*
		 * see below for in_trans_setup usage rules
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
		root->in_trans_setup = 1;

		/* make sure readers find in_trans_setup before
		 * they find our root->last_trans update
		 */
		smp_wmb();

J
Josef Bacik 已提交
165 166 167 168 169
		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
170 171 172
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
173
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
		 * with root->in_trans_setup.  When this is 1, we're still
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
195
		btrfs_init_reloc_root(trans, root);
C
Chris Mason 已提交
196 197
		smp_wmb();
		root->in_trans_setup = 0;
198 199 200
	}
	return 0;
}
201

C
Chris Mason 已提交
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (!root->ref_cows)
		return 0;

	/*
	 * see record_root_in_trans for comments about in_trans_setup usage
	 * and barriers
	 */
	smp_rmb();
	if (root->last_trans == trans->transid &&
	    !root->in_trans_setup)
		return 0;

	mutex_lock(&root->fs_info->reloc_mutex);
	record_root_in_trans(trans, root);
	mutex_unlock(&root->fs_info->reloc_mutex);

	return 0;
}

C
Chris Mason 已提交
225 226 227 228
/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
C
Chris Mason 已提交
229
static void wait_current_trans(struct btrfs_root *root)
C
Chris Mason 已提交
230
{
231
	struct btrfs_transaction *cur_trans;
C
Chris Mason 已提交
232

J
Josef Bacik 已提交
233
	spin_lock(&root->fs_info->trans_lock);
234
	cur_trans = root->fs_info->running_transaction;
C
Chris Mason 已提交
235
	if (cur_trans && cur_trans->blocked) {
236
		atomic_inc(&cur_trans->use_count);
J
Josef Bacik 已提交
237
		spin_unlock(&root->fs_info->trans_lock);
L
Li Zefan 已提交
238 239 240

		wait_event(root->fs_info->transaction_wait,
			   !cur_trans->blocked);
241
		put_transaction(cur_trans);
J
Josef Bacik 已提交
242 243
	} else {
		spin_unlock(&root->fs_info->trans_lock);
244
	}
C
Chris Mason 已提交
245 246
}

247 248 249 250
/* how a caller wants to enter a transaction; passed to start_transaction() */
enum btrfs_trans_type {
	TRANS_START,		/* used by btrfs_start_transaction() */
	TRANS_JOIN,		/* used by btrfs_join_transaction() */
	TRANS_USERSPACE,	/* used by btrfs_start_ioctl_transaction() */
	TRANS_JOIN_NOLOCK,	/* used by btrfs_join_transaction_nolock() */
};

254 255
/*
 * Decide whether a caller of the given @type may sleep waiting for a
 * blocked transaction.  Returns 1 if it may, 0 otherwise.
 */
static int may_wait_transaction(struct btrfs_root *root, int type)
{
	/* never wait while log recovery is in progress */
	if (root->fs_info->log_root_recovering)
		return 0;

	switch (type) {
	case TRANS_USERSPACE:
		return 1;
	case TRANS_START:
		/* only wait when no ioctl transaction is open */
		return !atomic_read(&root->fs_info->open_ioctl_trans);
	default:
		return 0;
	}
}

269
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
270
						    u64 num_items, int type)
C
Chris Mason 已提交
271
{
272 273
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
274
	u64 num_bytes = 0;
C
Chris Mason 已提交
275
	int ret;
L
liubo 已提交
276 277 278

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return ERR_PTR(-EROFS);
279 280 281 282 283 284 285 286 287

	if (current->journal_info) {
		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
		h = current->journal_info;
		h->use_count++;
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
288 289 290 291 292 293 294

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items > 0 && root != root->fs_info->chunk_root) {
		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
295
		ret = btrfs_block_rsv_add(root,
296 297 298 299 300
					  &root->fs_info->trans_block_rsv,
					  num_bytes);
		if (ret)
			return ERR_PTR(ret);
	}
301 302 303 304
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);
C
Chris Mason 已提交
305

306
	if (may_wait_transaction(root, type))
C
Chris Mason 已提交
307
		wait_current_trans(root);
308

J
Josef Bacik 已提交
309 310 311 312 313 314
	do {
		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
		if (ret == -EBUSY)
			wait_current_trans(root);
	} while (ret == -EBUSY);

T
Tsutomu Itoh 已提交
315
	if (ret < 0) {
316
		kmem_cache_free(btrfs_trans_handle_cachep, h);
T
Tsutomu Itoh 已提交
317 318
		return ERR_PTR(ret);
	}
319

320 321 322 323
	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
C
Chris Mason 已提交
324
	h->blocks_used = 0;
325
	h->bytes_reserved = 0;
326
	h->delayed_ref_updates = 0;
327
	h->use_count = 1;
328
	h->block_rsv = NULL;
329
	h->orig_rsv = NULL;
330
	h->aborted = 0;
331

332 333 334 335 336 337
	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

338
	if (num_bytes) {
J
Josef Bacik 已提交
339
		trace_btrfs_space_reservation(root->fs_info, "transaction",
340 341
					      (u64)(unsigned long)h,
					      num_bytes, 1);
342 343
		h->block_rsv = &root->fs_info->trans_block_rsv;
		h->bytes_reserved = num_bytes;
344
	}
J
Josef Bacik 已提交
345

346
got_it:
J
Josef Bacik 已提交
347
	btrfs_record_root_in_trans(h, root);
348 349 350

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
C
Chris Mason 已提交
351 352 353
	return h;
}

354
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
355
						   int num_items)
356
{
357
	return start_transaction(root, num_items, TRANS_START);
358
}
359
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
360
{
361
	return start_transaction(root, 0, TRANS_JOIN);
362 363
}

364
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
365 366 367 368
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

369
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
370
{
371
	return start_transaction(root, 0, TRANS_USERSPACE);
372 373
}

C
Chris Mason 已提交
374
/* wait for a transaction commit to be fully complete */
375
static noinline void wait_for_commit(struct btrfs_root *root,
376 377
				    struct btrfs_transaction *commit)
{
L
Li Zefan 已提交
378
	wait_event(commit->commit_wait, commit->commit_done);
379 380
}

381 382 383 384 385 386 387 388
/*
 * Wait for a transaction commit to finish.
 *
 * @transid: id of the transaction to wait for, or 0 to wait for the newest
 *           transaction that is currently committing.
 *
 * Returns 0 once the commit is done (or if there was nothing to wait for),
 * and -EINVAL if a non-zero @transid is newer than the last committed
 * transaction but is not on the transaction list.
 */
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_transaction *target = NULL;
	struct btrfs_transaction *iter;

	if (transid) {
		/* already on disk, nothing to wait for */
		if (transid <= fs_info->last_trans_committed)
			return 0;

		/* find specified transaction */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry(iter, &fs_info->trans_list, list) {
			if (iter->transid == transid) {
				target = iter;
				atomic_inc(&target->use_count);
				break;
			}
			if (iter->transid > transid)
				break;
		}
		spin_unlock(&fs_info->trans_lock);

		if (!target)
			return -EINVAL;	/* bad transid */
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry_reverse(iter, &fs_info->trans_list,
					    list) {
			if (!iter->in_commit)
				continue;
			/* only a commit still in flight needs waiting on */
			if (!iter->commit_done) {
				target = iter;
				atomic_inc(&target->use_count);
			}
			break;
		}
		spin_unlock(&fs_info->trans_lock);

		if (!target)
			return 0;	/* nothing committing|committed */
	}

	wait_for_commit(root, target);
	put_transaction(target);
	return 0;
}

C
Chris Mason 已提交
432 433
void btrfs_throttle(struct btrfs_root *root)
{
J
Josef Bacik 已提交
434
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
435
		wait_current_trans(root);
C
Chris Mason 已提交
436 437
}

438 439 440 441
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;
442 443

	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
444 445 446 447 448 449 450
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
451
	struct btrfs_block_rsv *rsv = trans->block_rsv;
452
	int updates;
453
	int err;
454

J
Josef Bacik 已提交
455
	smp_mb();
456 457 458
	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

459 460 461 462 463 464
	/*
	 * We need to do this in case we're deleting csums so the global block
	 * rsv get's used instead of the csum block rsv.
	 */
	trans->block_rsv = NULL;

465 466
	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
467 468 469 470 471
	if (updates) {
		err = btrfs_run_delayed_refs(trans, root, updates);
		if (err) /* Error code will also eval true */
			return err;
	}
472

473 474
	trans->block_rsv = rsv;

475 476 477
	return should_end_transaction(trans, root);
}

478
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
479
			  struct btrfs_root *root, int throttle, int lock)
C
Chris Mason 已提交
480
{
481
	struct btrfs_transaction *cur_trans = trans->transaction;
482
	struct btrfs_fs_info *info = root->fs_info;
483 484
	int count = 0;

485 486 487 488 489
	if (--trans->use_count) {
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

490
	btrfs_trans_release_metadata(trans, root);
491
	trans->block_rsv = NULL;
492
	while (count < 2) {
493 494 495 496 497 498 499 500 501 502
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
503 504
	}

J
Josef Bacik 已提交
505 506
	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
	    should_end_transaction(trans, root)) {
507
		trans->transaction->blocked = 1;
J
Josef Bacik 已提交
508 509
		smp_wmb();
	}
510

511
	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
512 513 514 515 516 517 518
		if (throttle) {
			/*
			 * We may race with somebody else here so end up having
			 * to call end_transaction on ourselves again, so inc
			 * our use_count.
			 */
			trans->use_count++;
519
			return btrfs_commit_transaction(trans, root);
520
		} else {
521
			wake_up_process(info->transaction_kthread);
522
		}
523 524 525
	}

	WARN_ON(cur_trans != info->running_transaction);
526 527
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
528

529
	smp_mb();
C
Chris Mason 已提交
530 531 532
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
J
Josef Bacik 已提交
533 534 535

	if (current->journal_info == trans)
		current->journal_info = NULL;
C
Chris Mason 已提交
536
	memset(trans, 0, sizeof(*trans));
C
Chris Mason 已提交
537
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
538

Y
Yan, Zheng 已提交
539 540 541
	if (throttle)
		btrfs_run_delayed_iputs(root);

542 543 544 545 546
	if (trans->aborted ||
	    root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		return -EIO;
	}

C
Chris Mason 已提交
547 548 549
	return 0;
}

550 551 552
/* End @trans with throttle = 0 and lock = 1; see __btrfs_end_transaction(). */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 1);
}

/* End @trans with throttle = 1 and lock = 1; see __btrfs_end_transaction(). */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

/* End @trans with throttle = 0 and lock = 0; see __btrfs_end_transaction(). */
int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 0);
}

/* End @trans with throttle = 1 and lock = 1; see __btrfs_end_transaction(). */
int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	int ret;

	ret = __btrfs_end_transaction(trans, root, 1, 1);
	return ret;
}

C
Chris Mason 已提交
589 590 591
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
592
 * those extents are sent to disk but does not wait on them
C
Chris Mason 已提交
593
 */
594
int btrfs_write_marked_extents(struct btrfs_root *root,
595
			       struct extent_io_tree *dirty_pages, int mark)
C
Chris Mason 已提交
596
{
597
	int err = 0;
598
	int werr = 0;
J
Josef Bacik 已提交
599
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
600
	u64 start = 0;
601
	u64 end;
602

J
Josef Bacik 已提交
603 604 605 606 607 608 609 610 611
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      mark)) {
		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
				   GFP_NOFS);
		err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
612
	}
613 614 615 616 617 618 619 620 621 622 623 624
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
625
			      struct extent_io_tree *dirty_pages, int mark)
626 627 628
{
	int err = 0;
	int werr = 0;
J
Josef Bacik 已提交
629
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
630 631
	u64 start = 0;
	u64 end;
632

J
Josef Bacik 已提交
633 634 635 636 637 638 639 640
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      EXTENT_NEED_WAIT)) {
		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
		err = filemap_fdatawait_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
641
	}
642 643 644
	if (err)
		werr = err;
	return werr;
C
Chris Mason 已提交
645 646
}

647 648 649 650 651 652
/*
 * When btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This makes sure all of those extents
 * are on disk for transaction or log commit: it starts writeback and then
 * waits, always running both phases.
 *
 * Returns the write phase's error if there was one, otherwise the wait
 * phase's result.
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int write_err = btrfs_write_marked_extents(root, dirty_pages, mark);
	int wait_err = btrfs_wait_marked_extents(root, dirty_pages, mark);

	return write_err ? write_err : wait_err;
}

668 669 670 671 672 673 674 675 676
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
677 678
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
679 680
}

C
Chris Mason 已提交
681 682 683 684 685 686 687 688 689 690
/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
691 692
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
C
Chris Mason 已提交
693 694
{
	int ret;
695
	u64 old_root_bytenr;
696
	u64 old_root_used;
697
	struct btrfs_root *tree_root = root->fs_info->tree_root;
C
Chris Mason 已提交
698

699
	old_root_used = btrfs_root_used(&root->root_item);
700
	btrfs_write_dirty_block_groups(trans, root);
701

C
Chris Mason 已提交
702
	while (1) {
703
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
704 705
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
C
Chris Mason 已提交
706
			break;
707

708
		btrfs_set_root_node(&root->root_item, root->node);
C
Chris Mason 已提交
709
		ret = btrfs_update_root(trans, tree_root,
710 711
					&root->root_key,
					&root->root_item);
712 713
		if (ret)
			return ret;
714

715
		old_root_used = btrfs_root_used(&root->root_item);
716
		ret = btrfs_write_dirty_block_groups(trans, root);
717 718
		if (ret)
			return ret;
719
	}
720 721 722 723

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

724 725 726
	return 0;
}

C
Chris Mason 已提交
727 728
/*
 * update all the cowonly tree roots on disk
729 730 731 732
 *
 * The error handling in this function may not be obvious. Any of the
 * failures will cause the file system to go offline. We still need
 * to clean up the delayed refs.
C
Chris Mason 已提交
733
 */
734 735
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
736 737 738
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
739
	struct extent_buffer *eb;
740
	int ret;
741

742
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
743 744
	if (ret)
		return ret;
745

746
	eb = btrfs_lock_root_node(fs_info->tree_root);
747 748
	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
			      0, &eb);
749 750
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);
751

752 753 754
	if (ret)
		return ret;

755
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
756 757
	if (ret)
		return ret;
758

C
Chris Mason 已提交
759
	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
760 761 762
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);
763

764 765 766
		ret = update_cowonly_root(trans, root);
		if (ret)
			return ret;
C
Chris Mason 已提交
767
	}
768 769 770 771 772

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

C
Chris Mason 已提交
773 774 775
	return 0;
}

C
Chris Mason 已提交
776 777 778 779 780
/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
781
int btrfs_add_dead_root(struct btrfs_root *root)
782
{
J
Josef Bacik 已提交
783
	spin_lock(&root->fs_info->trans_lock);
784
	list_add(&root->root_list, &root->fs_info->dead_roots);
J
Josef Bacik 已提交
785
	spin_unlock(&root->fs_info->trans_lock);
786 787 788
	return 0;
}

C
Chris Mason 已提交
789
/*
790
 * update all the cowonly tree roots on disk
C
Chris Mason 已提交
791
 */
792 793
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
794 795
{
	struct btrfs_root *gang[8];
796
	struct btrfs_fs_info *fs_info = root->fs_info;
797 798
	int i;
	int ret;
799 800
	int err = 0;

J
Josef Bacik 已提交
801
	spin_lock(&fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
802
	while (1) {
803 804
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
805 806 807 808 809 810
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
811 812 813
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
814
			spin_unlock(&fs_info->fs_roots_radix_lock);
Y
Yan Zheng 已提交
815

816
			btrfs_free_log(trans, root);
817
			btrfs_update_reloc_root(trans, root);
818
			btrfs_orphan_commit_root(trans, root);
819

820 821
			btrfs_save_ino_cache(root, trans);

822 823 824 825
			/* see comments in should_cow_block() */
			root->force_cow = 0;
			smp_wmb();

826
			if (root->commit_root != root->node) {
827
				mutex_lock(&root->fs_commit_mutex);
J
Josef Bacik 已提交
828
				switch_commit_root(root);
829 830 831
				btrfs_unpin_free_ino(root);
				mutex_unlock(&root->fs_commit_mutex);

832 833 834
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}
835 836

			err = btrfs_update_root(trans, fs_info->tree_root,
837 838
						&root->root_key,
						&root->root_item);
J
Josef Bacik 已提交
839
			spin_lock(&fs_info->fs_roots_radix_lock);
840 841
			if (err)
				break;
842 843
		}
	}
J
Josef Bacik 已提交
844
	spin_unlock(&fs_info->fs_roots_radix_lock);
845
	return err;
846 847
}

C
Chris Mason 已提交
848 849 850 851
/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
852 853 854 855
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
856
	int ret;
857
	unsigned long nr;
858

859
	if (xchg(&root->defrag_running, 1))
860
		return 0;
861

862
	while (1) {
863 864 865 866
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

867
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
868

869
		nr = trans->blocks_used;
870
		btrfs_end_transaction(trans, root);
871
		btrfs_btree_balance_dirty(info->tree_root, nr);
872 873
		cond_resched();

874
		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
875 876 877
			break;
	}
	root->defrag_running = 0;
878
	return ret;
879 880
}

C
Chris Mason 已提交
881 882 883 884
/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
885
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
886 887 888 889
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
890
	struct btrfs_root_item *new_root_item;
891 892
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
893
	struct btrfs_root *parent_root;
L
Liu Bo 已提交
894
	struct btrfs_block_rsv *rsv;
895
	struct inode *parent_inode;
896
	struct dentry *parent;
897
	struct dentry *dentry;
898
	struct extent_buffer *tmp;
899
	struct extent_buffer *old;
900
	int ret;
901
	u64 to_reserve = 0;
902
	u64 index = 0;
903
	u64 objectid;
L
Li Zefan 已提交
904
	u64 root_flags;
905

L
Liu Bo 已提交
906 907
	rsv = trans->block_rsv;

908 909
	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
910
		ret = pending->error = -ENOMEM;
911 912
		goto fail;
	}
913

914
	ret = btrfs_find_free_objectid(tree_root, &objectid);
915 916
	if (ret) {
		pending->error = ret;
917
		goto fail;
918
	}
919

920
	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
921 922

	if (to_reserve > 0) {
923 924
		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
						  to_reserve);
925 926 927 928 929 930
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

931
	key.objectid = objectid;
932 933
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;
934

935
	trans->block_rsv = &pending->block_rsv;
936

937
	dentry = pending->dentry;
938 939
	parent = dget_parent(dentry);
	parent_inode = parent->d_inode;
940
	parent_root = BTRFS_I(parent_inode)->root;
C
Chris Mason 已提交
941
	record_root_in_trans(trans, parent_root);
942

943 944 945
	/*
	 * insert the directory item
	 */
946
	ret = btrfs_set_inode_index(parent_inode, &index);
947
	BUG_ON(ret); /* -ENOMEM */
948
	ret = btrfs_insert_dir_item(trans, parent_root,
949
				dentry->d_name.name, dentry->d_name.len,
950
				parent_inode, &key,
951
				BTRFS_FT_DIR, index);
952 953 954 955
	if (ret) {
		pending->error = -EEXIST;
		dput(parent);
		goto fail;
956 957
	} else if (ret)
		goto abort_trans;
958

959 960
	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
961
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
962 963
	if (ret)
		goto abort_trans;
964

965 966 967 968 969 970 971
	/*
	 * pull in the delayed directory update
	 * and the delayed inode item
	 * otherwise we corrupt the FS during
	 * snapshot
	 */
	ret = btrfs_run_delayed_items(trans, root);
972 973
	if (ret) /* Transaction aborted */
		goto fail;
974

C
Chris Mason 已提交
975
	record_root_in_trans(trans, root);
976 977
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
978
	btrfs_check_and_init_root_item(new_root_item);
979

L
Li Zefan 已提交
980 981 982 983 984 985 986
	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

987
	old = btrfs_lock_root_node(root);
988 989 990 991
	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
	if (ret)
		goto abort_trans;

992 993
	btrfs_set_lock_blocking(old);

994 995 996 997
	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
	if (ret)
		goto abort_trans;

998 999 1000
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

1001 1002 1003 1004
	/* see comments in should_cow_block() */
	root->force_cow = 1;
	smp_wmb();

1005
	btrfs_set_root_node(new_root_item, tmp);
1006 1007 1008
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
1009 1010
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
1011 1012
	if (ret)
		goto abort_trans;
1013

1014 1015 1016 1017
	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
1018
				 parent_root->root_key.objectid,
L
Li Zefan 已提交
1019
				 btrfs_ino(parent_inode), index,
1020
				 dentry->d_name.name, dentry->d_name.len);
1021 1022
	if (ret)
		goto fail;
1023
	dput(parent);
1024

1025 1026
	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
1027 1028
	if (IS_ERR(pending->snap))
		goto abort_trans;
1029

1030 1031 1032 1033
	ret = btrfs_reloc_post_snapshot(trans, pending);
	if (ret)
		goto abort_trans;
	ret = 0;
1034
fail:
1035
	kfree(new_root_item);
L
Liu Bo 已提交
1036
	trans->block_rsv = rsv;
1037
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1038 1039 1040 1041 1042
	return ret;

abort_trans:
	btrfs_abort_transaction(trans, root, ret);
	goto fail;
1043 1044
}

C
Chris Mason 已提交
1045 1046 1047
/*
 * create all the snapshots we've scheduled for creation
 */
1048 1049
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
1050 1051 1052 1053
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;

1054 1055
	list_for_each_entry(pending, head, list)
		create_pending_snapshot(trans, fs_info, pending);
1056 1057 1058
	return 0;
}

1059 1060 1061 1062 1063
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

1064
	super = root->fs_info->super_copy;
1065 1066 1067 1068 1069 1070 1071 1072 1073 1074

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
1075
	if (btrfs_test_opt(root, SPACE_CACHE))
1076
		super->cache_generation = root_item->generation;
1077 1078
}

1079 1080 1081
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1082
	spin_lock(&info->trans_lock);
1083 1084
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
J
Josef Bacik 已提交
1085
	spin_unlock(&info->trans_lock);
1086 1087 1088
	return ret;
}

1089 1090 1091
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1092
	spin_lock(&info->trans_lock);
1093 1094
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
J
Josef Bacik 已提交
1095
	spin_unlock(&info->trans_lock);
1096 1097 1098
	return ret;
}

S
Sage Weil 已提交
1099 1100 1101 1102 1103 1104 1105
/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
					    struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1106
	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
S
Sage Weil 已提交
1107 1108 1109 1110 1111 1112 1113 1114 1115
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
					 struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1116 1117
	wait_event(root->fs_info->transaction_wait,
		   trans->commit_done || (trans->in_commit && !trans->blocked));
S
Sage Weil 已提交
1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146
}

/*
 * commit transactions asynchronously. once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 *
 * One of these is allocated per asynchronous commit request by
 * btrfs_commit_transaction_async() and freed by do_async_commit() once the
 * commit call has returned.
 */
struct btrfs_async_commit {
	/* handle obtained with btrfs_join_transaction(); committed by the worker */
	struct btrfs_trans_handle *newtrans;
	/* root handed to btrfs_commit_transaction() by the worker */
	struct btrfs_root *root;
	/* delayed work item whose handler is do_async_commit() */
	struct delayed_work work;
};

/*
 * Work handler for an asynchronous commit: recover the btrfs_async_commit
 * that embeds @work, commit its transaction handle and free the request.
 *
 * The return value of btrfs_commit_transaction() is discarded; there is no
 * caller to hand it back to.
 */
static void do_async_commit(struct work_struct *work)
{
	struct delayed_work *dwork =
		container_of(work, struct delayed_work, work);
	struct btrfs_async_commit *request =
		container_of(dwork, struct btrfs_async_commit, work);

	btrfs_commit_transaction(request->newtrans, request->root);
	kfree(request);
}

int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   int wait_for_unblock)
{
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
T
Tsutomu Itoh 已提交
1147 1148
	if (!ac)
		return -ENOMEM;
S
Sage Weil 已提交
1149 1150 1151

	INIT_DELAYED_WORK(&ac->work, do_async_commit);
	ac->root = root;
1152
	ac->newtrans = btrfs_join_transaction(root);
1153 1154 1155 1156 1157
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}
S
Sage Weil 已提交
1158 1159 1160

	/* take transaction reference */
	cur_trans = trans->transaction;
1161
	atomic_inc(&cur_trans->use_count);
S
Sage Weil 已提交
1162 1163 1164 1165 1166 1167 1168 1169 1170 1171

	btrfs_end_transaction(trans, root);
	schedule_delayed_work(&ac->work, 0);

	/* wait for transaction to start and unblock */
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(root, cur_trans);
	else
		wait_current_trans_commit_start(root, cur_trans);

1172 1173 1174 1175
	if (current->journal_info == trans)
		current->journal_info = NULL;

	put_transaction(cur_trans);
S
Sage Weil 已提交
1176 1177 1178
	return 0;
}

1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205

/*
 * Tear down the transaction behind @trans after its commit has been
 * abandoned, and free the handle.
 *
 * The transaction is unlinked from the transaction list, handed to
 * btrfs_cleanup_one_transaction(), and two references on it are dropped.
 * NOTE(review): presumably one reference belongs to this handle and one to
 * the list/running-transaction linkage, mirroring the two puts at the end of
 * btrfs_commit_transaction() -- confirm.
 *
 * @trans must not be used by the caller afterwards.
 */
static void cleanup_transaction(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;

	WARN_ON(trans->use_count > 1);

	spin_lock(&root->fs_info->trans_lock);
	list_del_init(&cur_trans->list);
	/*
	 * The commit can fail after trans_no_join was raised but before
	 * running_transaction was cleared.  Undo both here, otherwise
	 * running_transaction keeps pointing at a transaction whose
	 * references are dropped below and new joiners stay locked out.
	 */
	if (cur_trans == root->fs_info->running_transaction) {
		root->fs_info->running_transaction = NULL;
		root->fs_info->trans_no_join = 0;
	}
	spin_unlock(&root->fs_info->trans_lock);

	btrfs_cleanup_one_transaction(cur_trans, root);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	trace_btrfs_transaction_commit(root);

	/*
	 * NOTE(review): this runs even when the commit failed before
	 * btrfs_scrub_pause() was reached -- verify the pause/continue
	 * accounting tolerates that.
	 */
	btrfs_scrub_continue(root);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
}

S
Sage Weil 已提交
1206 1207 1208 1209 1210 1211 1212
/*
 * btrfs_transaction state sequence:
 *    in_commit = 0, blocked = 0  (initial)
 *    in_commit = 1, blocked = 1
 *    blocked = 0
 *    commit_done = 1
 */
C
Chris Mason 已提交
1213 1214 1215
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
1216
	unsigned long joined = 0;
1217
	struct btrfs_transaction *cur_trans = trans->transaction;
C
Chris Mason 已提交
1218
	struct btrfs_transaction *prev_trans = NULL;
C
Chris Mason 已提交
1219
	DEFINE_WAIT(wait);
1220
	int ret = -EIO;
1221 1222
	int should_grow = 0;
	unsigned long now = get_seconds();
1223
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
C
Chris Mason 已提交
1224

1225 1226
	btrfs_run_ordered_operations(root, 0);

1227
	btrfs_trans_release_metadata(trans, root);
1228 1229
	trans->block_rsv = NULL;

1230 1231 1232
	if (cur_trans->aborted)
		goto cleanup_transaction;

1233 1234 1235 1236
	/* make a pass through all the delayed refs we have so far
	 * any runnings procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
1237 1238
	if (ret)
		goto cleanup_transaction;
1239

1240
	cur_trans = trans->transaction;
1241

1242 1243 1244 1245
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
1246
	cur_trans->delayed_refs.flushing = 1;
1247

1248
	ret = btrfs_run_delayed_refs(trans, root, 0);
1249 1250
	if (ret)
		goto cleanup_transaction;
1251

J
Josef Bacik 已提交
1252
	spin_lock(&cur_trans->commit_lock);
1253
	if (cur_trans->in_commit) {
J
Josef Bacik 已提交
1254
		spin_unlock(&cur_trans->commit_lock);
1255
		atomic_inc(&cur_trans->use_count);
1256
		ret = btrfs_end_transaction(trans, root);
C
Chris Mason 已提交
1257

1258
		wait_for_commit(root, cur_trans);
1259

C
Chris Mason 已提交
1260
		put_transaction(cur_trans);
1261

1262
		return ret;
C
Chris Mason 已提交
1263
	}
1264

C
Chris Mason 已提交
1265
	trans->transaction->in_commit = 1;
1266
	trans->transaction->blocked = 1;
J
Josef Bacik 已提交
1267
	spin_unlock(&cur_trans->commit_lock);
S
Sage Weil 已提交
1268 1269
	wake_up(&root->fs_info->transaction_blocked_wait);

J
Josef Bacik 已提交
1270
	spin_lock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1271 1272 1273 1274
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
1275
			atomic_inc(&prev_trans->use_count);
J
Josef Bacik 已提交
1276
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1277 1278 1279

			wait_for_commit(root, prev_trans);

1280
			put_transaction(prev_trans);
J
Josef Bacik 已提交
1281 1282
		} else {
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1283
		}
J
Josef Bacik 已提交
1284 1285
	} else {
		spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1286
	}
1287

1288 1289 1290
	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

1291
	do {
1292
		int snap_pending = 0;
J
Josef Bacik 已提交
1293

1294
		joined = cur_trans->num_joined;
1295 1296 1297
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

C
Chris Mason 已提交
1298
		WARN_ON(cur_trans != trans->transaction);
1299

1300
		if (flush_on_commit || snap_pending) {
Y
Yan, Zheng 已提交
1301
			btrfs_start_delalloc_inodes(root, 1);
1302
			btrfs_wait_ordered_extents(root, 0, 1);
1303 1304
		}

1305
		ret = btrfs_run_delayed_items(trans, root);
1306 1307
		if (ret)
			goto cleanup_transaction;
1308

1309 1310 1311 1312 1313 1314 1315 1316 1317
		/*
		 * rename don't use btrfs_join_transaction, so, once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and no for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

1318 1319 1320
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

1321
		if (atomic_read(&cur_trans->num_writers) > 1)
1322 1323 1324
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);
1325 1326

		finish_wait(&cur_trans->writer_wait, &wait);
1327
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
1328
		 (should_grow && cur_trans->num_joined != joined));
1329

1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340
	/*
	 * Ok now we need to make sure to block out any other joins while we
	 * commit the transaction.  We could have started a join before setting
	 * no_join so make sure to wait for num_writers to == 1 again.
	 */
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->trans_no_join = 1;
	spin_unlock(&root->fs_info->trans_lock);
	wait_event(cur_trans->writer_wait,
		   atomic_read(&cur_trans->num_writers) == 1);

C
Chris Mason 已提交
1341 1342 1343 1344 1345 1346 1347
	/*
	 * the reloc mutex makes sure that we stop
	 * the balancing code from coming in and moving
	 * extents around in the middle of the commit
	 */
	mutex_lock(&root->fs_info->reloc_mutex);

1348
	ret = btrfs_run_delayed_items(trans, root);
1349 1350 1351 1352
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1353

1354
	ret = create_pending_snapshots(trans, root->fs_info);
1355 1356 1357 1358
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1359

1360
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1361 1362 1363 1364
	if (ret) {
		mutex_unlock(&root->fs_info->reloc_mutex);
		goto cleanup_transaction;
	}
1365

1366 1367 1368 1369 1370 1371
	/*
	 * make sure none of the code above managed to slip in a
	 * delayed item
	 */
	btrfs_assert_delayed_root_empty(root);

C
Chris Mason 已提交
1372
	WARN_ON(cur_trans != trans->transaction);
C
Chris Mason 已提交
1373

A
Arne Jansen 已提交
1374
	btrfs_scrub_pause(root);
1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389
	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

1390
	ret = commit_fs_roots(trans, root);
1391 1392 1393 1394
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}
1395

1396
	/* commit_fs_roots gets rid of all the tree log roots, it is now
1397 1398 1399 1400
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

1401
	ret = commit_cowonly_roots(trans, root);
1402 1403 1404 1405
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}
1406

1407 1408
	btrfs_prepare_extent_commit(trans, root);

C
Chris Mason 已提交
1409
	cur_trans = root->fs_info->running_transaction;
1410 1411 1412

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
J
Josef Bacik 已提交
1413
	switch_commit_root(root->fs_info->tree_root);
1414 1415 1416

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
J
Josef Bacik 已提交
1417
	switch_commit_root(root->fs_info->chunk_root);
1418 1419

	update_super_roots(root);
1420 1421

	if (!root->fs_info->log_root_recovering) {
1422 1423
		btrfs_set_super_log_root(root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1424 1425
	}

1426 1427
	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
	       sizeof(*root->fs_info->super_copy));
C
Chris Mason 已提交
1428

1429
	trans->transaction->blocked = 0;
J
Josef Bacik 已提交
1430 1431 1432 1433
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->running_transaction = NULL;
	root->fs_info->trans_no_join = 0;
	spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1434
	mutex_unlock(&root->fs_info->reloc_mutex);
1435

1436
	wake_up(&root->fs_info->transaction_wait);
1437

C
Chris Mason 已提交
1438
	ret = btrfs_write_and_wait_transaction(trans, root);
1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450
	if (ret) {
		btrfs_error(root->fs_info, ret,
			    "Error while writing out transaction.");
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}

	ret = write_ctree_super(trans, root, 0);
	if (ret) {
		mutex_unlock(&root->fs_info->tree_log_mutex);
		goto cleanup_transaction;
	}
1451

1452 1453 1454 1455 1456 1457
	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

1458
	btrfs_finish_extent_commit(trans, root);
1459

C
Chris Mason 已提交
1460
	cur_trans->commit_done = 1;
1461

1462
	root->fs_info->last_trans_committed = cur_trans->transid;
J
Josef Bacik 已提交
1463

C
Chris Mason 已提交
1464
	wake_up(&cur_trans->commit_wait);
1465

J
Josef Bacik 已提交
1466
	spin_lock(&root->fs_info->trans_lock);
1467
	list_del_init(&cur_trans->list);
J
Josef Bacik 已提交
1468 1469
	spin_unlock(&root->fs_info->trans_lock);

C
Chris Mason 已提交
1470
	put_transaction(cur_trans);
C
Chris Mason 已提交
1471
	put_transaction(cur_trans);
1472

1473 1474
	trace_btrfs_transaction_commit(root);

A
Arne Jansen 已提交
1475 1476
	btrfs_scrub_continue(root);

J
Josef Bacik 已提交
1477 1478 1479
	if (current->journal_info == trans)
		current->journal_info = NULL;

C
Chris Mason 已提交
1480
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
Y
Yan, Zheng 已提交
1481 1482 1483 1484

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

C
Chris Mason 已提交
1485
	return ret;
1486 1487 1488 1489 1490 1491 1492 1493 1494

cleanup_transaction:
	btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
//	WARN_ON(1);
	if (current->journal_info == trans)
		current->journal_info = NULL;
	cleanup_transaction(trans, root);

	return ret;
C
Chris Mason 已提交
1495 1496
}

C
Chris Mason 已提交
1497 1498 1499
/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
1500 1501
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
1502 1503 1504
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

J
Josef Bacik 已提交
1505
	spin_lock(&fs_info->trans_lock);
1506
	list_splice_init(&fs_info->dead_roots, &list);
J
Josef Bacik 已提交
1507
	spin_unlock(&fs_info->trans_lock);
1508

1509
	while (!list_empty(&list)) {
1510 1511
		int ret;

1512
		root = list_entry(list.next, struct btrfs_root, root_list);
1513 1514
		list_del(&root->root_list);

1515 1516
		btrfs_kill_all_delayed_nodes(root);

1517 1518
		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
1519
			ret = btrfs_drop_snapshot(root, NULL, 0, 0);
1520
		else
1521 1522
			ret =btrfs_drop_snapshot(root, NULL, 1, 0);
		BUG_ON(ret < 0);
1523 1524 1525
	}
	return 0;
}